The shift to using typer seems to be ignoring the `max_docs` argument. It is set to `10 ** 6`, but I am getting a new file for each individual line when training vectors. Even when adding `--max-docs 1000000` to my command, it is still going line by line instead of creating batches.
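To rule out the CLI layer, I put together a minimal sanity check (hypothetical, not part of the project scripts; `check_option.py` is just a name I made up) that only parses the option and echoes it back:

```python
# check_option.py -- minimal, hypothetical check that typer parses --max-docs
import typer


def main(
    max_docs: int = typer.Option(10 ** 6, "--max-docs", "-m", help="Maximum docs per batch"),
):
    # If typer is working, this prints whatever value was passed on the CLI
    typer.echo(f"max_docs = {max_docs}")


if __name__ == "__main__":
    typer.run(main)
```

Running `python check_option.py --max-docs 1000000` should print `max_docs = 1000000`, which would suggest the problem is in the batching loop rather than in typer's option parsing.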
Here is the code from `01_parse.py`:
```python
from pathlib import Path

import spacy
import tqdm
import typer
from spacy.tokens import DocBin
from wasabi import msg


def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"),
    n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10 ** 6, "--max-docs", "-m", help="Maximum docs per batch"),  # <-- this isn't working, it seems
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                # Flush the current batch to disk and start a new one
                batch_num += 1
                count = 0
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    # Write whatever is left in the final (possibly partial) batch
    batch_num += 1
    output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
    doc_bin_bytes = doc_bin.to_bytes()
    with output_file.open("wb") as f:
        f.write(doc_bin_bytes)
    msg.good("Complete. Saved final parsed docs to file", output_file.resolve())


if __name__ == "__main__":
    typer.run(main)
```
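To make the behavior I expected concrete, here is a simplified, spaCy-free sketch of the batching pattern the loop above is aiming for. All names and numbers are made up for illustration; it keeps batches in memory instead of writing `.spacy` files:

```python
# Simplified, hypothetical model of the batching loop above (no spaCy, no files):
# with max_docs = 3 and 8 "docs", I'd expect three batches, not one per doc.
max_docs = 3
docs = [f"doc{i}" for i in range(8)]  # stand-in for the output of nlp.pipe

batches = []
current = []
for doc in docs:
    if len(current) < max_docs:
        current.append(doc)
    else:
        batches.append(current)  # in the real script: write a .spacy file
        current = [doc]  # keep the doc that triggered the flush
batches.append(current)  # final, possibly partial batch

print([len(b) for b in batches])  # -> [3, 3, 2]
```

With `max_docs` at `10 ** 6`, my input should fit in a handful of batches, so getting one output file per line is not what I'd expect from this pattern.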