diff --git a/src/corppa/poetry_detection/refmatcha.py b/src/corppa/poetry_detection/refmatcha.py index ee22a442..d0f0b8b8 100755 --- a/src/corppa/poetry_detection/refmatcha.py +++ b/src/corppa/poetry_detection/refmatcha.py @@ -54,6 +54,7 @@ LABELED_EXCERPT_FIELDS, fix_data_types, ) +from corppa.poetry_detection.ref_corpora import compile_metadata_df, fulltext_corpora logger = logging.getLogger(__name__) @@ -65,74 +66,31 @@ REF_DATA_DIR = pathlib.Path("poetry-reference-data") TEXT_PARQUET_FILE = REF_DATA_DIR / "poems.parquet" META_PARQUET_FILE = REF_DATA_DIR / "poem_metadata.parquet" -# csv files to supplement .txt files -POETRY_FOUNDATION_CSV = REF_DATA_DIR / "poetryfoundationdataset.csv" -CHADWYCK_HEALEY_CSV = REF_DATA_DIR / "chadwyck_healey_metadata.csv" -# define source ids to ensure we are consistent -SOURCE_ID = { - "Poetry Foundation": "poetry-foundation", - "Chadwyck-Healey": "chadwyck-healey", - "internet-poems": "internet_poems", -} def compile_text(data_dir, output_file): """Compile reference poems into a parquet file for quick identification - of poetry excerpts based on matching text. Looks for text files in - directories under `data_dir`; uses the filename stem as poem identifier - and the containing directory name as the id for the source reference corpus. - Also looks for and includes content from `poetryfoundationdataset.csv` - contained in the data directory. + of poetry excerpts based on matching text. 
""" - - # parquet file schema: - # - poem id - # - text of the poem - # - source (identifier for the reference corpus) - schema = pa.schema( - [("id", pa.string()), ("text", pa.string()), ("source", pa.string())] + poem_text = pl.DataFrame( + [], + schema={ + "poem_id": pl.String, + "text": pl.String, + "ref_corpus": pl.String, + }, ) - # open a parquet writer so we can add records in chunks - pqwriter = pq.ParquetWriter(output_file, schema) - - # handle files in batches - # look for .txt files in nested directories; use parent directory name as - # the reference corpus source name/id - for chunk in batched(iglob(f"{data_dir}/**/*.txt"), 1000): - chunk_files = [pathlib.Path(f) for f in chunk] - ids = [f.stem for f in chunk_files] - sources = [SOURCE_ID.get(f.parent.name, f.parent.name) for f in chunk_files] - texts = [f.open().read() for f in chunk_files] - # create and write a record batch - record_batch = pa.RecordBatch.from_arrays( - [ids, texts, sources], names=["id", "text", "source"] - ) - pqwriter.write_batch(record_batch) - - # poetry foundation text content is included in the csv file - if POETRY_FOUNDATION_CSV.exists(): - # load poetry foundation csv into a polars dataframe - # - rename columns for our use - # - add source column - # - select only the columns we want to include - pf_df = ( - pl.read_csv(POETRY_FOUNDATION_CSV) - .rename({"Poetry Foundation ID": "id", "Content": "text"}) - .with_columns(source=pl.lit(SOURCE_ID["Poetry Foundation"])) - .select(["id", "text", "source"]) - ) - # convert polars dataframe to arrow table, cast to our schema to - # align types (large string vs string), then write out in batches - for batch in pf_df.to_arrow().cast(target_schema=schema).to_batches(): - pqwriter.write_batch(batch) - else: - print( - f"Poetry Foundation csv file not found for text compilation (expected at {POETRY_FOUNDATION_CSV})", - file=sys.stderr, - ) - # close the parquet file - pqwriter.close() + # for each corpus, load poem metadata into a 
polars dataframe, + # rename id to poem_id, and add a column with the corpus id + for ref_corpus in fulltext_corpora(): + # NOTE: could enable progress bar here... but needs + # prefix/context + corpus_text = pl.from_dicts( + ref_corpus.get_text(disable_progress=False) + ).with_columns(ref_corpus=pl.lit(ref_corpus.corpus_id)) + poem_text.extend(corpus_text) + poem_text.write_parquet(output_file) def compile_metadata(data_dir, output_file): @@ -146,97 +104,9 @@ def compile_metadata(data_dir, output_file): # for poem dataset output, we need poem id, author, and title # to match text results, we need poem id and source id - schema = pa.schema( - [ - ("id", pa.string()), - ("source", pa.string()), - ("author", pa.string()), - ("title", pa.string()), - ] - ) - # open a parquet writer for outputting content in batches - pqwriter = pq.ParquetWriter(output_file, schema) - - # TODO: prioritize internet-poems matches over CH - - # load chadwyck healey metadata - if CHADWYCK_HEALEY_CSV.exists(): - # use polars to read in the csv and convert to the format we want - # - rename main title to title - # - add source id for all rows - # - combine author first and last name - # - reorder and limit columns to match parquet schema - df = ( - # ignore parse errors in fields we don't care about (author_dob) - pl.read_csv(CHADWYCK_HEALEY_CSV, ignore_errors=True) - .rename({"title_main": "title"}) - .with_columns(source=pl.lit(SOURCE_ID["Chadwyck-Healey"])) - .with_columns( - pl.concat_str( - [pl.col("author_firstname"), pl.col("author_lastname")], - separator=" ", - ).alias("author") - ) - .select(["id", "source", "author", "title"]) - ) - # convert polars dataframe to arrow table, cast to our schema to - # align types (large string vs string), then write out in batches - for batch in df.to_arrow().cast(target_schema=schema).to_batches(): - pqwriter.write_batch(batch) - else: - print( - f"Chadwyck-Healey csv file not found for metadata compilation (expected at {CHADWYCK_HEALEY_CSV})", - 
file=sys.stderr, - ) - - # for the directory of internet poems, metadata is embedded in file name - internet_poems_dir = data_dir / "internet-poems" - # this directory is a set of manually curated texts; - # currently only 112 files, so don't worry about chunking until needed - poem_files = list(internet_poems_dir.glob("*.txt")) - # use filename without .txt as poem identifier - ids = [p.stem for p in poem_files] - # filename is : Firstname-Lastname_Poem-Title.txt - # author name: filename before the _ with dashes replaced with spaces - authors = [p.stem.split("_", 1)[0].replace("-", " ") for p in poem_files] - # title: same as author for the text after the _ - titles = [p.stem.split("_", 1)[1].replace("-", " ") for p in poem_files] - source = [SOURCE_ID["internet-poems"]] * len(ids) - - # create a record batch to write out - record_batch = pa.RecordBatch.from_arrays( - [ids, source, authors, titles], names=["id", "source", "author", "title"] - ) - pqwriter.write_batch(record_batch) - - # load poetry foundation data from csv file - # do this one last since it is least preferred of our sources - if POETRY_FOUNDATION_CSV.exists(): - # use polars to read in the csv and convert to the format we want - # - rename columns to match desired output - # - add source id - # - reorder and limit columns to match parquet schema - df = ( - pl.read_csv(POETRY_FOUNDATION_CSV) - # .drop("Content", "") - .rename( - {"Author": "author", "Title": "title", "Poetry Foundation ID": "id"} - ) - .with_columns(source=pl.lit(SOURCE_ID["Poetry Foundation"])) - .select(["id", "source", "author", "title"]) - ) - # convert polars dataframe to arrow table, cast to our schema to - # align types (large string vs string), then write out in batches - for batch in df.to_arrow().cast(target_schema=schema).to_batches(): - pqwriter.write_batch(batch) - else: - print( - f"Poetry Foundation csv file not found for metadata compilation (expected at {POETRY_FOUNDATION_CSV})", - file=sys.stderr, - ) - - # close 
the parquet file - pqwriter.close() + meta_df = compile_metadata_df() + # TODO: add an option to request full-text corpora only? + meta_df.write_parquet(output_file) # unicode line separator; used in some internet poems text files @@ -253,6 +123,7 @@ def _text_for_search(expr): .str.replace_all(r"(\w) \| -(\w)", "$1$2") # replace other punctuation with spaces .str.replace_all("[[:punct:]]", " ") + .str.replace_all("\\*", "") # remove asterisk .str.replace_all( LINE_SEPARATOR, "\n" ) # replace unicode line separator with newline @@ -338,20 +209,6 @@ def multiple_matches(filtered_ref_df): if match_df is not None: return match_df, reason - # if author/title duplication check failed, check for author matches - # poetry foundation includes Shakespeare drama excerpts with alternate names - authordupe_df = df.filter(df.select(["_author"]).is_duplicated()) - if not authordupe_df.is_empty(): - # Shakespeare shows up oddly in poetry foundation; - # if author matches, assume the other source has the correct title - non_poetryfoundtn = authordupe_df.filter( - pl.col("source") != SOURCE_ID["Poetry Foundation"] - ) - if non_poetryfoundtn.height == 1: - match_df = non_poetryfoundtn.limit(1) - reason = "duplicate author but not title; excluding Poetry Foundation" - return match_df, reason - return None, None @@ -385,9 +242,9 @@ def identify_excerpt( ) match_info = None result = None - # preserve any notes on the incoming excerpt - # (is this what we want? notes might get duplicated if/when we merge...) - note_lines = [excerpt_row["notes"]] if excerpt.notes is not None else [] + # notes on the incoming excerpt are ignored, since we assume + # the output will be merged. + # If we preserve existing notes here they will be duplicated on merge. 
search_field = f"search_{search_text}" search_field_label = search_text.replace("_", " ") @@ -450,8 +307,6 @@ def identify_excerpt( # but we only return labeled excerpts, so out of scope for now if match_df is not None: - # rename columns for export - match_df = match_df.rename({"id": "poem_id", "source": "ref_corpus"}) # get the first row as a dictionary match_info = match_df.row(0, named=True) @@ -489,9 +344,7 @@ def identify_excerpt( ] # add note about how the match was determined - # return as new field; must be merged with notes in calling code - note_lines.append(f"{SCRIPT_ID}: {id_note}") - match_info["notes"] = "\n".join(note_lines).strip() + match_info["notes"] = f"{SCRIPT_ID}: {id_note}" # set id method match_info["identification_methods"] = {SCRIPT_ID} @@ -596,13 +449,13 @@ def process(input_file, output_file, recompile=False): reference_df = pl.read_parquet(TEXT_PARQUET_FILE) meta_df = pl.read_parquet(META_PARQUET_FILE) print(f"Poetry reference text data: {reference_df.height:,} entries") - print("total by source") - source_counts = reference_df["source"].value_counts() + print("total by reference corpus") + source_counts = reference_df["ref_corpus"].value_counts() for value, count in source_counts.iter_rows(): # row is a tuple of value, count print(f"\t{value}: {count:,}") - # some texts from poetry foundation and maybe Chadwyck-Healey are truncated + # some texts from Chadwyck-Healey are truncated # discard them to avoid bad partial/fuzzy matches reference_df = reference_df.with_columns(text_length=pl.col("text").str.len_chars()) min_length = 15 @@ -611,8 +464,8 @@ def process(input_file, output_file, recompile=False): print(f" Omitting {short_texts.height} poems with text length < {min_length}") print(f"Poetry reference metadata: {meta_df.height:,} entries") - print("total by source") - source_counts = meta_df["source"].value_counts() + print("total by reference corpus") + source_counts = meta_df["ref_corpus"].value_counts() for value, count in 
source_counts.iter_rows(): # row is a tuple of value, count print(f"\t{value}: {count:,}") @@ -621,16 +474,16 @@ def process(input_file, output_file, recompile=False): reference_df = reference_df.join( meta_df, # join on the combination of poem id and source id - on=pl.concat_str([pl.col("id"), pl.col("source")], separator="|"), + on=pl.concat_str([pl.col("poem_id"), pl.col("ref_corpus")], separator="|"), how="left", # occasionally ids do not match, # e.g. Chadwyck Healey poem id we have text for but not in metadata - ).drop("id_right", "source_right") + ).drop("poem_id_right", "ref_corpus_right") # generate a simplified text field for searching # NOTE: this part is a bit slow reference_df = generate_search_text(reference_df) - # load csv with excerpt fieldnames + # load csv with excerpt field names try: input_df = fix_data_types(pl.read_csv(input_file, columns=EXCERPT_FIELDS)) except pl.exceptions.NoDataError as err: diff --git a/test/test_poetry_detection/test_refmatcha.py b/test/test_poetry_detection/test_refmatcha.py index cca58546..bffe28f0 100644 --- a/test/test_poetry_detection/test_refmatcha.py +++ b/test/test_poetry_detection/test_refmatcha.py @@ -7,9 +7,7 @@ from corppa.poetry_detection.core import Excerpt, LabeledExcerpt from corppa.poetry_detection.refmatcha import ( - CHADWYCK_HEALEY_CSV, META_PARQUET_FILE, - POETRY_FOUNDATION_CSV, SCRIPT_ID, TEXT_PARQUET_FILE, compile_metadata, @@ -64,17 +62,17 @@ def test_searchable_text(input, expected): ref_poetry_data = [ { - "id": "Z200653845", + "poem_id": "Z200653845", "text": """By his wonderful work's we see plainly enough That the earth is the Lord's and the fullness thereof; When hungry and thirsty we're ready to faint, He seeth our need and prevents our complaint;""", - "source": "chadwyck-healey", + "ref_corpus": "chadwyck-healey", }, { - "id": "King-James-Bible_Psalms", + "poem_id": "King-James-Bible_Psalms", "text": "He hath made his wonderful works to be remembered", - "source": "internet-poems", + 
"ref_corpus": "internet-poems", }, ] @@ -123,8 +121,8 @@ def test_identify_excerpt(reference_df): # single match id_result = identify_excerpt(excerpt_row, reference_df) assert isinstance(id_result, LabeledExcerpt) - assert id_result.poem_id == ref_poetry_data[0]["id"] - assert id_result.ref_corpus == ref_poetry_data[0]["source"] + assert id_result.poem_id == ref_poetry_data[0]["poem_id"] + assert id_result.ref_corpus == ref_poetry_data[0]["ref_corpus"] # these are from the searchable version of the input text assert id_result.ref_span_start == 50 assert id_result.ref_span_end == 97 @@ -137,7 +135,7 @@ def test_identify_excerpt(reference_df): "The earth is the\nLords\t\tand the fullness thereof" ) id_result = identify_excerpt(excerpt_row, reference_df) - assert id_result.poem_id == ref_poetry_data[0]["id"] + assert id_result.poem_id == ref_poetry_data[0]["poem_id"] # no match excerpt_row["search_text"] = "Disdain forbids me and my dread of shame" @@ -169,7 +167,7 @@ def test_identify_excerpt_first_line(reference_df): # single match id_result = identify_excerpt(excerpt_row, reference_df, "first_line") assert isinstance(id_result, LabeledExcerpt) - assert id_result.poem_id == ref_poetry_data[0]["id"] + assert id_result.poem_id == ref_poetry_data[0]["poem_id"] assert id_result.ref_span_start == 50 # end and text adjusted based on length of search text assert id_result.ref_span_end == 142 @@ -189,7 +187,7 @@ def test_identify_excerpt_last_line(reference_df): # single match id_result = identify_excerpt(excerpt_row, reference_df, "last_line") - assert id_result.poem_id == ref_poetry_data[0]["id"] + assert id_result.poem_id == ref_poetry_data[0]["poem_id"] # start and text adjusted based on length of search text assert id_result.ref_span_start == 50 assert id_result.ref_span_end == 142 @@ -220,8 +218,10 @@ def test_identify_excerpt_multiple(mock_multimatch, reference_df): ) mock_multimatch.return_value = (ref_match_df.limit(1), reason) id_result = 
identify_excerpt(excerpt_row, reference_df) - assert id_result.poem_id == ref_poetry_data[0]["id"] - assert id_result.ref_corpus == ref_poetry_data[0]["source"] + print(id_result) + print(ref_poetry_data) + assert id_result.poem_id == ref_poetry_data[0]["poem_id"] + assert id_result.ref_corpus == ref_poetry_data[0]["ref_corpus"] assert id_result.ref_span_text == "matched text" assert id_result.ref_span_start == 10 assert id_result.ref_span_end == 20 @@ -233,31 +233,35 @@ def test_identify_excerpt_multiple(mock_multimatch, reference_df): def test_multiple_matches(): # title + author match (ignore case, punctuation) reference_data = [ - {"author": "James Thomson", "title": "Winter", "source": "internet-poems"}, - {"author": "James Thomson", "title": "WINTER.", "source": "chadwyck-healey"}, + {"author": "James Thomson", "title": "Winter", "ref_corpus": "internet-poems"}, + { + "author": "James Thomson", + "title": "WINTER.", + "ref_corpus": "chadwyck-healey", + }, ] match, reason = multiple_matches(pl.from_dicts(reference_data)) assert reason == "all rows match author + title" # first match is returned - assert match["source"][0] == "internet-poems" + assert match["ref_corpus"][0] == "internet-poems" - # first source is prioritized when everything matches + # first reference corpus is prioritized when everything matches reference_data.reverse() match, reason = multiple_matches(pl.from_dicts(reference_data)) # first match is returned - assert match["source"][0] == "chadwyck-healey" + assert match["ref_corpus"][0] == "chadwyck-healey" # similar title; current logic can't match this reference_data = [ { "author": "Robert Burns", "title": "Stay My Charmer Can You Leave", - "source": "internet-poems", + "ref_corpus": "internet-poems", }, { "author": "Robert Burns", "title": "STAY, MY CHARMER", - "source": "chadwyck-healey", + "ref_corpus": "chadwyck-healey", }, ] match, reason = multiple_matches(pl.from_dicts(reference_data)) @@ -269,40 +273,22 @@ def 
test_multiple_matches(): { "author": "James Hogg", "title": "Mador of the Moor", - "source": "internet-poems", + "ref_corpus": "internet-poems", }, { "author": "James Hogg", "title": "The Palmers Morning Hymn", - "source": "internet-poems", + "ref_corpus": "internet-poems", }, { "author": "James Hogg", "title": "MADOR OF THE MOOR. ", - "source": "chadwyck-healey", + "ref_corpus": "chadwyck-healey", }, ] match, reason = multiple_matches(pl.from_dicts(reference_data)) assert reason == "majority match author + title (2 out of 3)" - assert match["source"][0] == "internet-poems" - - # poetry foundation title mismatch - # majority match - reference_data = [ - { - "author": "William Shakespeare", - "title": "As You Like It", - "source": "internet-poems", - }, - { - "author": "William Shakespeare", - "title": "Song: Blow, blow thou...", - "source": "poetry-foundation", - }, - ] - match, reason = multiple_matches(pl.from_dicts(reference_data)) - assert reason == "duplicate author but not title; excluding Poetry Foundation" - assert match["source"][0] == "internet-poems" + assert match["ref_corpus"][0] == "internet-poems" @pytest.fixture @@ -338,18 +324,11 @@ def chadwyck_healey_csv(tmp_path): return ch_meta_csv -@pytest.fixture -def poetry_foundation_csv(tmp_path): - "fixture to create a test version of the poetry foundation csv file" - pfound_csv = tmp_path / POETRY_FOUNDATION_CSV - pfound_csv.parent.mkdir(exist_ok=True) - pfound_csv.write_text(""",Author,Title,Poetry Foundation ID,Content -0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what I hope ...\"""") - return pfound_csv - - def test_compile_text( - tmp_path, capsys, internet_poem, chadwyck_healey_poem, poetry_foundation_csv + tmp_path, + capsys, + internet_poem, + chadwyck_healey_poem, ): os.chdir(tmp_path) # output file to be created @@ -370,21 +349,9 @@ def test_compile_text( assert text_row["id"] == internet_poem.stem assert text_row["text"].startswith("ARMA virumque cano") assert 
text_row["source"] == "internet_poems" - text_row = text_df.row(2, named=True) - assert text_row["id"] == "55489" - assert text_row["text"].startswith("Dear Writers, I’m compiling") - assert text_row["source"] == "poetry-foundation" - - # should get a warning if poetry foundation csv is missing - poetry_foundation_csv.unlink() - compile_text(tmp_path, text_file) - captured = capsys.readouterr() - assert "Poetry Foundation csv file not found for text compilation" in captured.err -def test_compile_metadata( - tmp_path, capsys, internet_poem, poetry_foundation_csv, chadwyck_healey_csv -): +def test_compile_metadata(tmp_path, capsys, internet_poem, chadwyck_healey_csv): os.chdir(tmp_path) # output file to be created metadata_file = tmp_path / "metadata.parquet" @@ -405,21 +372,12 @@ def test_compile_metadata( assert poem_meta["author"] == "Virgil" assert poem_meta["title"] == "Aeneid" assert poem_meta["source"] == "internet_poems" - poem_meta = meta_df.row(2, named=True) - assert poem_meta["id"] == "55489" - assert poem_meta["source"] == "poetry-foundation" - assert poem_meta["author"] == "Wendy Videlock" - assert poem_meta["title"] == "!" # when CSVs are not found, should see error messages - poetry_foundation_csv.unlink() chadwyck_healey_csv.unlink() compile_metadata(tmp_path, metadata_file) captured = capsys.readouterr() assert "Chadwyck-Healey csv file not found for metadata compilation" in captured.err - assert ( - "Poetry Foundation csv file not found for metadata compilation" in captured.err - ) def test_process( @@ -427,7 +385,6 @@ def test_process( capsys, internet_poem, chadwyck_healey_poem, - poetry_foundation_csv, chadwyck_healey_csv, ): # minimal test to run process code