diff --git a/src/corppa/poetry_detection/refmatcha.py b/src/corppa/poetry_detection/refmatcha.py
index ee22a442..d0f0b8b8 100755
--- a/src/corppa/poetry_detection/refmatcha.py
+++ b/src/corppa/poetry_detection/refmatcha.py
@@ -54,6 +54,7 @@
     LABELED_EXCERPT_FIELDS,
     fix_data_types,
 )
+from corppa.poetry_detection.ref_corpora import compile_metadata_df, fulltext_corpora
 
 logger = logging.getLogger(__name__)
 
@@ -65,74 +66,31 @@
 REF_DATA_DIR = pathlib.Path("poetry-reference-data")
 TEXT_PARQUET_FILE = REF_DATA_DIR / "poems.parquet"
 META_PARQUET_FILE = REF_DATA_DIR / "poem_metadata.parquet"
-# csv files to supplement .txt files
-POETRY_FOUNDATION_CSV = REF_DATA_DIR / "poetryfoundationdataset.csv"
-CHADWYCK_HEALEY_CSV = REF_DATA_DIR / "chadwyck_healey_metadata.csv"
-# define source ids to ensure we are consistent
-SOURCE_ID = {
-    "Poetry Foundation": "poetry-foundation",
-    "Chadwyck-Healey": "chadwyck-healey",
-    "internet-poems": "internet_poems",
-}
 
 
 def compile_text(data_dir, output_file):
     """Compile reference poems into a parquet file for quick identification
-    of poetry excerpts based on matching text. Looks for text files in
-    directories under `data_dir`; uses the filename stem as poem identifier
-    and the containing directory name as the id for the source reference corpus.
-    Also looks for and includes content from `poetryfoundationdataset.csv`
-    contained in the data directory.
+    of poetry excerpts based on matching text.
     """
-
-    # parquet file schema:
-    # - poem id
-    # - text of the poem
-    # - source (identifier for the reference corpus)
-    schema = pa.schema(
-        [("id", pa.string()), ("text", pa.string()), ("source", pa.string())]
+    poem_text = pl.DataFrame(
+        [],
+        schema={
+            "poem_id": pl.String,
+            "text": pl.String,
+            "ref_corpus": pl.String,
+        },
     )
-    # open a parquet writer so we can add records in chunks
-    pqwriter = pq.ParquetWriter(output_file, schema)
-
-    # handle files in batches
-    # look for .txt files in nested directories; use parent directory name as
-    # the reference corpus source name/id
-    for chunk in batched(iglob(f"{data_dir}/**/*.txt"), 1000):
-        chunk_files = [pathlib.Path(f) for f in chunk]
-        ids = [f.stem for f in chunk_files]
-        sources = [SOURCE_ID.get(f.parent.name, f.parent.name) for f in chunk_files]
-        texts = [f.open().read() for f in chunk_files]
-        # create and write a record batch
-        record_batch = pa.RecordBatch.from_arrays(
-            [ids, texts, sources], names=["id", "text", "source"]
-        )
-        pqwriter.write_batch(record_batch)
-
-    # poetry foundation text content is included in the csv file
-    if POETRY_FOUNDATION_CSV.exists():
-        # load poetry foundation csv into a polars dataframe
-        # - rename columns for our use
-        # - add source column
-        # - select only the columns we want to include
-        pf_df = (
-            pl.read_csv(POETRY_FOUNDATION_CSV)
-            .rename({"Poetry Foundation ID": "id", "Content": "text"})
-            .with_columns(source=pl.lit(SOURCE_ID["Poetry Foundation"]))
-            .select(["id", "text", "source"])
-        )
-        # convert polars dataframe to arrow table, cast to our schema to
-        # align types (large string vs string), then write out in batches
-        for batch in pf_df.to_arrow().cast(target_schema=schema).to_batches():
-            pqwriter.write_batch(batch)
-    else:
-        print(
-            f"Poetry Foundation csv file not found for text compilation (expected at {POETRY_FOUNDATION_CSV})",
-            file=sys.stderr,
-        )
 
-    # close the parquet file
-    pqwriter.close()
+    # for each full-text corpus, load the poem text into a polars
+    # dataframe and add a column with the corpus id
+    for ref_corpus in fulltext_corpora():
+        # NOTE(review): disable_progress=False presumably leaves the progress
+        # bar on; confirm that is intended, since it still needs prefix/context
+        corpus_text = pl.from_dicts(
+            ref_corpus.get_text(disable_progress=False)
+        ).with_columns(ref_corpus=pl.lit(ref_corpus.corpus_id))
+        poem_text.extend(corpus_text)
+    poem_text.write_parquet(output_file)
 
 
 def compile_metadata(data_dir, output_file):
@@ -146,97 +104,9 @@ def compile_metadata(data_dir, output_file):
     # for poem dataset output, we need poem id, author, and title
     # to match text results, we need poem id and source id
 
-    schema = pa.schema(
-        [
-            ("id", pa.string()),
-            ("source", pa.string()),
-            ("author", pa.string()),
-            ("title", pa.string()),
-        ]
-    )
-    # open a parquet writer for outputting content in batches
-    pqwriter = pq.ParquetWriter(output_file, schema)
-
-    # TODO: prioritize internet-poems matches over CH
-
-    # load chadwyck healey metadata
-    if CHADWYCK_HEALEY_CSV.exists():
-        # use polars to read in the csv and convert to the format we want
-        # - rename main title to title
-        # - add source id for all rows
-        # - combine author first and last name
-        # - reorder and limit columns to match parquet schema
-        df = (
-            # ignore parse errors in fields we don't care about (author_dob)
-            pl.read_csv(CHADWYCK_HEALEY_CSV, ignore_errors=True)
-            .rename({"title_main": "title"})
-            .with_columns(source=pl.lit(SOURCE_ID["Chadwyck-Healey"]))
-            .with_columns(
-                pl.concat_str(
-                    [pl.col("author_firstname"), pl.col("author_lastname")],
-                    separator=" ",
-                ).alias("author")
-            )
-            .select(["id", "source", "author", "title"])
-        )
-        # convert polars dataframe to arrow table, cast to our schema to
-        # align types (large string vs string), then write out in batches
-        for batch in df.to_arrow().cast(target_schema=schema).to_batches():
-            pqwriter.write_batch(batch)
-    else:
-        print(
-            f"Chadwyck-Healey csv file not found for metadata compilation (expected at {CHADWYCK_HEALEY_CSV})",
-            file=sys.stderr,
-        )
-
-    # for the directory of internet poems, metadata is embedded in file name
-    internet_poems_dir = data_dir / "internet-poems"
-    # this directory is a set of manually curated texts;
-    # currently only 112 files, so don't worry about chunking until needed
-    poem_files = list(internet_poems_dir.glob("*.txt"))
-    # use filename without .txt as poem identifier
-    ids = [p.stem for p in poem_files]
-    # filename is : Firstname-Lastname_Poem-Title.txt
-    # author name: filename before the _ with dashes replaced with spaces
-    authors = [p.stem.split("_", 1)[0].replace("-", " ") for p in poem_files]
-    # title: same as author for the text after the _
-    titles = [p.stem.split("_", 1)[1].replace("-", " ") for p in poem_files]
-    source = [SOURCE_ID["internet-poems"]] * len(ids)
-
-    # create a record batch to write out
-    record_batch = pa.RecordBatch.from_arrays(
-        [ids, source, authors, titles], names=["id", "source", "author", "title"]
-    )
-    pqwriter.write_batch(record_batch)
-
-    # load poetry foundation data from csv file
-    # do this one last since it is least preferred of our sources
-    if POETRY_FOUNDATION_CSV.exists():
-        # use polars to read in the csv and convert to the format we want
-        # - rename columns to match desired output
-        # - add source id
-        # - reorder and limit columns to match parquet schema
-        df = (
-            pl.read_csv(POETRY_FOUNDATION_CSV)
-            # .drop("Content", "")
-            .rename(
-                {"Author": "author", "Title": "title", "Poetry Foundation ID": "id"}
-            )
-            .with_columns(source=pl.lit(SOURCE_ID["Poetry Foundation"]))
-            .select(["id", "source", "author", "title"])
-        )
-        # convert polars dataframe to arrow table, cast to our schema to
-        # align types (large string vs string), then write out in batches
-        for batch in df.to_arrow().cast(target_schema=schema).to_batches():
-            pqwriter.write_batch(batch)
-    else:
-        print(
-            f"Poetry Foundation csv file not found for metadata compilation (expected at {POETRY_FOUNDATION_CSV})",
-            file=sys.stderr,
-        )
-
-    # close the parquet file
-    pqwriter.close()
+    meta_df = compile_metadata_df()
+    # TODO: add an option to request full-text corpora only?
+    meta_df.write_parquet(output_file)
 
 
 # unicode line separator; used in some internet poems text files
@@ -253,6 +123,7 @@ def _text_for_search(expr):
         .str.replace_all(r"(\w) \| -(\w)", "$1$2")
         # replace other punctuation with spaces
         .str.replace_all("[[:punct:]]", " ")
+        .str.replace_all("\\*", "")  # remove asterisk; NOTE(review): "*" is already matched by [[:punct:]] above, so this looks like a no-op — confirm ordering
         .str.replace_all(
             LINE_SEPARATOR, "\n"
         )  # replace unicode line separator with newline
@@ -338,20 +209,6 @@ def multiple_matches(filtered_ref_df):
     if match_df is not None:
         return match_df, reason
 
-    # if author/title duplication check failed, check for author matches
-    # poetry foundation includes Shakespeare drama excerpts with alternate names
-    authordupe_df = df.filter(df.select(["_author"]).is_duplicated())
-    if not authordupe_df.is_empty():
-        # Shakespeare shows up oddly in poetry foundation;
-        # if author matches, assume the other source has the correct title
-        non_poetryfoundtn = authordupe_df.filter(
-            pl.col("source") != SOURCE_ID["Poetry Foundation"]
-        )
-        if non_poetryfoundtn.height == 1:
-            match_df = non_poetryfoundtn.limit(1)
-            reason = "duplicate author but not title; excluding Poetry Foundation"
-            return match_df, reason
-
     return None, None
 
 
@@ -385,9 +242,9 @@ def identify_excerpt(
     )
     match_info = None
     result = None
-    # preserve any notes on the incoming excerpt
-    # (is this what we want? notes might get duplicated if/when we merge...)
-    note_lines = [excerpt_row["notes"]] if excerpt.notes is not None else []
+    # notes on the incoming excerpt are ignored, since we assume
+    # the output will be merged.
+    # If we preserve existing notes here they will be duplicated on merge.
 
     search_field = f"search_{search_text}"
     search_field_label = search_text.replace("_", " ")
@@ -450,8 +307,6 @@ def identify_excerpt(
         # but we only return labeled excerpts, so out of scope for now
 
         if match_df is not None:
-            # rename columns for export
-            match_df = match_df.rename({"id": "poem_id", "source": "ref_corpus"})
             # get the first row as a dictionary
             match_info = match_df.row(0, named=True)
 
@@ -489,9 +344,7 @@ def identify_excerpt(
                 ]
 
             # add note about how the match was determined
-            # return as new field; must be merged with notes in calling code
-            note_lines.append(f"{SCRIPT_ID}: {id_note}")
-            match_info["notes"] = "\n".join(note_lines).strip()
+            match_info["notes"] = f"{SCRIPT_ID}: {id_note}"
             # set id method
             match_info["identification_methods"] = {SCRIPT_ID}
 
@@ -596,13 +449,13 @@ def process(input_file, output_file, recompile=False):
     reference_df = pl.read_parquet(TEXT_PARQUET_FILE)
     meta_df = pl.read_parquet(META_PARQUET_FILE)
     print(f"Poetry reference text data: {reference_df.height:,} entries")
-    print("total by source")
-    source_counts = reference_df["source"].value_counts()
+    print("total by reference corpus")
+    source_counts = reference_df["ref_corpus"].value_counts()
     for value, count in source_counts.iter_rows():
         # row is a tuple of value, count
         print(f"\t{value}: {count:,}")
 
-    # some texts from poetry foundation and maybe Chadwyck-Healey are truncated
+    # some texts from Chadwyck-Healey are truncated
     # discard them to avoid bad partial/fuzzy matches
     reference_df = reference_df.with_columns(text_length=pl.col("text").str.len_chars())
     min_length = 15
@@ -611,8 +464,8 @@ def process(input_file, output_file, recompile=False):
     print(f"  Omitting {short_texts.height} poems with text length < {min_length}")
 
     print(f"Poetry reference metadata: {meta_df.height:,} entries")
-    print("total by source")
-    source_counts = meta_df["source"].value_counts()
+    print("total by reference corpus")
+    source_counts = meta_df["ref_corpus"].value_counts()
     for value, count in source_counts.iter_rows():
         # row is a tuple of value, count
         print(f"\t{value}: {count:,}")
@@ -621,16 +474,16 @@ def process(input_file, output_file, recompile=False):
     reference_df = reference_df.join(
         meta_df,
         # join on the combination of poem id and source id
-        on=pl.concat_str([pl.col("id"), pl.col("source")], separator="|"),
+        on=pl.concat_str([pl.col("poem_id"), pl.col("ref_corpus")], separator="|"),
         how="left",  # occasionally ids do not match,
         # e.g. Chadwyck Healey poem id we have text for but not in metadata
-    ).drop("id_right", "source_right")
+    ).drop("poem_id_right", "ref_corpus_right")
 
     # generate a simplified text field for searching
     # NOTE: this part is a bit slow
     reference_df = generate_search_text(reference_df)
 
-    # load csv with excerpt fieldnames
+    # load csv with excerpt field names
     try:
         input_df = fix_data_types(pl.read_csv(input_file, columns=EXCERPT_FIELDS))
     except pl.exceptions.NoDataError as err:
diff --git a/test/test_poetry_detection/test_refmatcha.py b/test/test_poetry_detection/test_refmatcha.py
index cca58546..bffe28f0 100644
--- a/test/test_poetry_detection/test_refmatcha.py
+++ b/test/test_poetry_detection/test_refmatcha.py
@@ -7,9 +7,7 @@
 
 from corppa.poetry_detection.core import Excerpt, LabeledExcerpt
 from corppa.poetry_detection.refmatcha import (
-    CHADWYCK_HEALEY_CSV,
     META_PARQUET_FILE,
-    POETRY_FOUNDATION_CSV,
     SCRIPT_ID,
     TEXT_PARQUET_FILE,
     compile_metadata,
@@ -64,17 +62,17 @@ def test_searchable_text(input, expected):
 
 ref_poetry_data = [
     {
-        "id": "Z200653845",
+        "poem_id": "Z200653845",
         "text": """By his wonderful work's we see plainly enough
That the earth is the Lord's and the fullness thereof;
When hungry and thirsty we're ready to faint,
He seeth our need and prevents our complaint;""",
-        "source": "chadwyck-healey",
+        "ref_corpus": "chadwyck-healey",
     },
     {
-        "id": "King-James-Bible_Psalms",
+        "poem_id": "King-James-Bible_Psalms",
         "text": "He hath made his wonderful works to be remembered",
-        "source": "internet-poems",
+        "ref_corpus": "internet-poems",
     },
 ]
 
@@ -123,8 +121,8 @@ def test_identify_excerpt(reference_df):
     # single match
     id_result = identify_excerpt(excerpt_row, reference_df)
     assert isinstance(id_result, LabeledExcerpt)
-    assert id_result.poem_id == ref_poetry_data[0]["id"]
-    assert id_result.ref_corpus == ref_poetry_data[0]["source"]
+    assert id_result.poem_id == ref_poetry_data[0]["poem_id"]
+    assert id_result.ref_corpus == ref_poetry_data[0]["ref_corpus"]
     # these are from the searchable version of the input text
     assert id_result.ref_span_start == 50
     assert id_result.ref_span_end == 97
@@ -137,7 +135,7 @@ def test_identify_excerpt(reference_df):
         "The earth    is the\nLords\t\tand the fullness thereof"
     )
     id_result = identify_excerpt(excerpt_row, reference_df)
-    assert id_result.poem_id == ref_poetry_data[0]["id"]
+    assert id_result.poem_id == ref_poetry_data[0]["poem_id"]
 
     # no match
     excerpt_row["search_text"] = "Disdain forbids me and my dread of shame"
@@ -169,7 +167,7 @@ def test_identify_excerpt_first_line(reference_df):
     # single match
     id_result = identify_excerpt(excerpt_row, reference_df, "first_line")
     assert isinstance(id_result, LabeledExcerpt)
-    assert id_result.poem_id == ref_poetry_data[0]["id"]
+    assert id_result.poem_id == ref_poetry_data[0]["poem_id"]
     assert id_result.ref_span_start == 50
     # end and text adjusted based on length of search text
     assert id_result.ref_span_end == 142
@@ -189,7 +187,7 @@ def test_identify_excerpt_last_line(reference_df):
 
     # single match
     id_result = identify_excerpt(excerpt_row, reference_df, "last_line")
-    assert id_result.poem_id == ref_poetry_data[0]["id"]
+    assert id_result.poem_id == ref_poetry_data[0]["poem_id"]
     # start and text adjusted based on length of search text
     assert id_result.ref_span_start == 50
     assert id_result.ref_span_end == 142
@@ -220,8 +218,10 @@ def test_identify_excerpt_multiple(mock_multimatch, reference_df):
     )
     mock_multimatch.return_value = (ref_match_df.limit(1), reason)
     id_result = identify_excerpt(excerpt_row, reference_df)
-    assert id_result.poem_id == ref_poetry_data[0]["id"]
-    assert id_result.ref_corpus == ref_poetry_data[0]["source"]
+    print(id_result)
+    print(ref_poetry_data)
+    assert id_result.poem_id == ref_poetry_data[0]["poem_id"]
+    assert id_result.ref_corpus == ref_poetry_data[0]["ref_corpus"]
     assert id_result.ref_span_text == "matched text"
     assert id_result.ref_span_start == 10
     assert id_result.ref_span_end == 20
@@ -233,31 +233,35 @@ def test_identify_excerpt_multiple(mock_multimatch, reference_df):
 def test_multiple_matches():
     # title + author match (ignore case, punctuation)
     reference_data = [
-        {"author": "James Thomson", "title": "Winter", "source": "internet-poems"},
-        {"author": "James Thomson", "title": "WINTER.", "source": "chadwyck-healey"},
+        {"author": "James Thomson", "title": "Winter", "ref_corpus": "internet-poems"},
+        {
+            "author": "James Thomson",
+            "title": "WINTER.",
+            "ref_corpus": "chadwyck-healey",
+        },
     ]
     match, reason = multiple_matches(pl.from_dicts(reference_data))
     assert reason == "all rows match author + title"
     # first match is returned
-    assert match["source"][0] == "internet-poems"
+    assert match["ref_corpus"][0] == "internet-poems"
 
-    # first source is prioritized when everything matches
+    # first reference corpus is prioritized when everything matches
     reference_data.reverse()
     match, reason = multiple_matches(pl.from_dicts(reference_data))
     # first match is returned
-    assert match["source"][0] == "chadwyck-healey"
+    assert match["ref_corpus"][0] == "chadwyck-healey"
 
     # similar title; current logic can't match this
     reference_data = [
         {
             "author": "Robert Burns",
             "title": "Stay My Charmer Can You Leave",
-            "source": "internet-poems",
+            "ref_corpus": "internet-poems",
         },
         {
             "author": "Robert Burns",
             "title": "STAY, MY CHARMER",
-            "source": "chadwyck-healey",
+            "ref_corpus": "chadwyck-healey",
         },
     ]
     match, reason = multiple_matches(pl.from_dicts(reference_data))
@@ -269,40 +273,22 @@ def test_multiple_matches():
         {
             "author": "James Hogg",
             "title": "Mador of the Moor",
-            "source": "internet-poems",
+            "ref_corpus": "internet-poems",
         },
         {
             "author": "James Hogg",
             "title": "The Palmers Morning Hymn",
-            "source": "internet-poems",
+            "ref_corpus": "internet-poems",
         },
         {
             "author": "James Hogg",
             "title": "MADOR OF THE MOOR. ",
-            "source": "chadwyck-healey",
+            "ref_corpus": "chadwyck-healey",
         },
     ]
     match, reason = multiple_matches(pl.from_dicts(reference_data))
     assert reason == "majority match author + title (2 out of 3)"
-    assert match["source"][0] == "internet-poems"
-
-    # poetry foundation title mismatch
-    # majority match
-    reference_data = [
-        {
-            "author": "William Shakespeare",
-            "title": "As You Like It",
-            "source": "internet-poems",
-        },
-        {
-            "author": "William Shakespeare",
-            "title": "Song: Blow, blow thou...",
-            "source": "poetry-foundation",
-        },
-    ]
-    match, reason = multiple_matches(pl.from_dicts(reference_data))
-    assert reason == "duplicate author but not title; excluding Poetry Foundation"
-    assert match["source"][0] == "internet-poems"
+    assert match["ref_corpus"][0] == "internet-poems"
 
 
 @pytest.fixture
@@ -338,18 +324,11 @@ def chadwyck_healey_csv(tmp_path):
     return ch_meta_csv
 
 
-@pytest.fixture
-def poetry_foundation_csv(tmp_path):
-    "fixture to create a test version of the poetry foundation csv file"
-    pfound_csv = tmp_path / POETRY_FOUNDATION_CSV
-    pfound_csv.parent.mkdir(exist_ok=True)
-    pfound_csv.write_text(""",Author,Title,Poetry Foundation ID,Content
-0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what I hope ...\"""")
-    return pfound_csv
-
-
 def test_compile_text(
-    tmp_path, capsys, internet_poem, chadwyck_healey_poem, poetry_foundation_csv
+    tmp_path,
+    capsys,
+    internet_poem,
+    chadwyck_healey_poem,
 ):
     os.chdir(tmp_path)
     # output file to be created
@@ -370,21 +349,9 @@ def test_compile_text(
     assert text_row["id"] == internet_poem.stem
     assert text_row["text"].startswith("ARMA virumque cano")
     assert text_row["source"] == "internet_poems"
-    text_row = text_df.row(2, named=True)
-    assert text_row["id"] == "55489"
-    assert text_row["text"].startswith("Dear Writers, I’m compiling")
-    assert text_row["source"] == "poetry-foundation"
-
-    # should get a warning if poetry foundation csv is missing
-    poetry_foundation_csv.unlink()
-    compile_text(tmp_path, text_file)
-    captured = capsys.readouterr()
-    assert "Poetry Foundation csv file not found for text compilation" in captured.err
 
 
-def test_compile_metadata(
-    tmp_path, capsys, internet_poem, poetry_foundation_csv, chadwyck_healey_csv
-):
+def test_compile_metadata(tmp_path, capsys, internet_poem, chadwyck_healey_csv):
     os.chdir(tmp_path)
     # output file to be created
     metadata_file = tmp_path / "metadata.parquet"
@@ -405,21 +372,12 @@ def test_compile_metadata(
     assert poem_meta["author"] == "Virgil"
     assert poem_meta["title"] == "Aeneid"
     assert poem_meta["source"] == "internet_poems"
-    poem_meta = meta_df.row(2, named=True)
-    assert poem_meta["id"] == "55489"
-    assert poem_meta["source"] == "poetry-foundation"
-    assert poem_meta["author"] == "Wendy Videlock"
-    assert poem_meta["title"] == "!"
 
     # when CSVs are not found, should see error messages
-    poetry_foundation_csv.unlink()
     chadwyck_healey_csv.unlink()
     compile_metadata(tmp_path, metadata_file)
     captured = capsys.readouterr()
     assert "Chadwyck-Healey csv file not found for metadata compilation" in captured.err
-    assert (
-        "Poetry Foundation csv file not found for metadata compilation" in captured.err
-    )
 
 
 def test_process(
@@ -427,7 +385,6 @@ def test_process(
     capsys,
     internet_poem,
     chadwyck_healey_poem,
-    poetry_foundation_csv,
     chadwyck_healey_csv,
 ):
     # minimal test to run process code