Skip to content
Open
9 changes: 4 additions & 5 deletions .readthedocs.yml → .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,11 @@
version: 2

python:
version: 3.9
install:
- method: pip
path: .
- method: pip
path: .
extra_requirements:
- dev
- docs
system_packages: true

sphinx:
Expand All @@ -23,4 +20,6 @@ formats:
- pdf

build:
image: latest
os: ubuntu-22.04
tools:
python: "3.9"
50 changes: 50 additions & 0 deletions src/textacy/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,50 @@
OBJ_DEPS: set[str] = {"attr", "dobj", "dative", "oprd"}
AUX_DEPS: set[str] = {"aux", "auxpass", "neg"}

# Pairs of code points (opening mark, closing mark) that are accepted as the
# start and end of a quotation. The characters are those flagged by
# ``token.is_quote``, matched up by start and end; a newline (code point 10)
# is also listed as a possible closing mark.
# Some of these pairs are from weirdly formatted newspaper uploads, so there
# could be some noise!
# source characters:
#   either = "\"\'"
#   start = "“‘```“‘«‹「『„‚"
#   end = "”’’’’”’»›」』”’"
QUOTATION_MARK_PAIRS: set[tuple[int, int]] = {
    # conventional pairings
    (34, 34),  # " "
    (39, 39),  # ' '
    (96, 8217),  # ` ’
    (171, 187),  # « »
    (8216, 8217),  # ‘ ’
    (8218, 8217),  # ‚ ’
    (8220, 8221),  # “ ”
    (8222, 8221),  # „ ”
    (8249, 8250),  # ‹ ›
    (12300, 12301),  # 「 」
    (12302, 12303),  # 『 』
    # typographic opener closed by a plain double quote
    (8220, 34),  # “ "
    (8216, 34),  # ‘ "
    (96, 34),  # ` "
    (171, 34),  # « "
    (8249, 34),  # ‹ "
    (12300, 34),  # 「 "
    (12302, 34),  # 『 "
    (8222, 34),  # „ "
    (8218, 34),  # ‚ "
    # plain double quote closed by a typographic closer
    (34, 8221),  # " ”
    (34, 8217),  # " ’
    # opener closed by a newline
    (34, 10),  # " \n
    (39, 10),  # ' \n
    (96, 10),  # ` \n
    (171, 10),  # « \n
    (8216, 10),  # ‘ \n
    (8218, 10),  # ‚ \n
    (8220, 10),  # “ \n
    (8249, 10),  # ‹ \n
    (12300, 10),  # 「 \n
    (12302, 10),  # 『 \n
}

REPORTING_VERBS: dict[str, set[str]] = {
"en": {
"according",
Expand Down Expand Up @@ -196,3 +240,9 @@
)

RE_ALNUM: Pattern = re.compile(r"[^\W_]+")

# character inventories and regex sources used when prepping text for quote detection
ALL_QUOTES = "‹「`»」‘\"„›”‚’'』『«“"
DOUBLE_QUOTES = '‹「」»"„『”‚』›«“'
# character class matching any one character of DOUBLE_QUOTES
ANY_DOUBLE_QUOTE_REGEX = f"[{DOUBLE_QUOTES}]"
# a DOUBLE_QUOTES character with a non-whitespace character on both sides
DOUBLE_QUOTES_NOSPACE_REGEX = rf"(?<=\S)([{DOUBLE_QUOTES}])(?=\S)"
242 changes: 175 additions & 67 deletions src/textacy/extract/triples.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@

import collections
from operator import attrgetter
from typing import Iterable, Mapping, Optional, Pattern
from typing import Iterable, Mapping, Optional, Pattern, Literal

from cytoolz import itertoolz
from spacy.symbols import (
AUX,
VERB,
PUNCT,
agent,
attr,
aux,
Expand All @@ -30,6 +30,7 @@
xcomp,
)
from spacy.tokens import Doc, Span, Token
import re

from .. import constants, types, utils
from . import matches
Expand Down Expand Up @@ -202,13 +203,14 @@ def semistructured_statements(
)


def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
def direct_quotations(doc: Doc, min_quote_length=4) -> Iterable[DQTriple]:
"""
Extract direct quotations with an attributable speaker from a document
using simple rules and patterns. Does not extract indirect or mixed quotations!

Args:
doc
min_quote_length - minimum distance (in tokens) between potentially paired quotation marks.

Yields:
Next direct quotation in ``doc`` as a (speaker, cue, content) triple.
Expand All @@ -217,85 +219,88 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
Tagging of Reported Speech in Newspaper Articles".
"""
# TODO: train a model to do this instead, maybe similar to entity recognition
try:
_reporting_verbs = constants.REPORTING_VERBS[doc.lang_]
except KeyError:
raise ValueError(
f"direct quotation extraction is not implemented for lang='{doc.lang_}', "
f"only {sorted(constants.REPORTING_VERBS.keys())}"
)
qtok_idxs = [tok.i for tok in doc if tok.is_quote]
if len(qtok_idxs) % 2 != 0:
raise ValueError(
f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; "
"given the limitations of this method, it's safest to bail out "
"rather than guess which quotation is unclosed"
)
qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs))
for qtok_start_idx, qtok_end_idx in qtok_pair_idxs:
content = doc[qtok_start_idx : qtok_end_idx + 1]
# pairs up quotation-like characters based on acceptable start/end combos
# see constants for more info
qtoks = [tok for tok in doc if tok.is_quote or (re.match(r"\n", tok.text))]
qtok_idx_pairs = [(-1, -1)]
for n, q in enumerate(qtoks):
if (
not bool(q.whitespace_)
and q.i not in [q_[1] for q_ in qtok_idx_pairs]
and q.i > qtok_idx_pairs[-1][1]
):
for q_ in qtoks[n + 1 :]:
if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS:
qtok_idx_pairs.append((q.i, q_.i))
break
qtok_idx_pairs = qtok_idx_pairs[1:]

def filter_quote_tokens(tok):
return any(qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_idx_pairs)

for qtok_start_idx, qtok_end_idx in qtok_idx_pairs:
content = doc[qtok_start_idx:qtok_end_idx]
cue = None
speaker = None
# filter quotations by content

if (
# quotations should have at least a couple tokens
# excluding the first/last quotation mark tokens
len(content) < 4
len(content.text.split()) < min_quote_length
# filter out titles of books and such, if possible
or all(
tok.is_title
for tok in content
# if tok.pos in {NOUN, PROPN}
if not (tok.is_punct or tok.is_stop)
)
# TODO: require closing punctuation before the quotation mark?
# content[-2].is_punct is False
or all(tok.is_title for tok in content if not (tok.is_punct or tok.is_stop))
):
continue
# get window of adjacent/overlapping sentences
window_sents = (
sent
for sent in doc.sents
# these boundary cases are a subtle bit of work...
if (
(sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1)
or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx)
)
)
# get candidate cue verbs in window
cue_cands = [
tok
for sent in window_sents
for tok in sent
if (
tok.pos == VERB

for window_sents in [
windower(content, "overlap"),
windower(content, "linebreaks"),
]:
# get candidate cue verbs in window
cue_candidates = [
tok
for sent in window_sents
for tok in sent
if tok.pos == VERB
and tok.lemma_ in _reporting_verbs
# cue verbs must occur *outside* any quotation content
and not any(
qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_pair_idxs
)
and not filter_quote_tokens(tok)
]
cue_candidates = sorted(
cue_candidates,
key=lambda cc: min(
abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)
),
)
]
# sort candidates by proximity to quote content
cue_cands = sorted(
cue_cands,
key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)),
)
for cue_cand in cue_cands:
if cue is not None:
break
for speaker_cand in cue_cand.children:
if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
cue = expand_verb(cue_cand)
speaker = expand_noun(speaker_cand)
for cue_cand in cue_candidates:
if cue is not None:
break
if content and cue and speaker:
yield DQTriple(
speaker=sorted(speaker, key=attrgetter("i")),
cue=sorted(cue, key=attrgetter("i")),
content=content,
)
speaker_cands = [
speaker_cand
for speaker_cand in cue_cand.children
if speaker_cand.pos != PUNCT
and not filter_quote_tokens(speaker_cand)
and (
(speaker_cand.i >= qtok_end_idx)
or (speaker_cand.i <= qtok_start_idx)
)
]
for speaker_cand in speaker_cands:
if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
cue = expand_verb(cue_cand)
speaker = expand_noun(speaker_cand)
break
if content and cue and speaker:
yield DQTriple(
speaker=sorted(speaker, key=attrgetter("i")),
cue=sorted(cue, key=attrgetter("i")),
content=doc[qtok_start_idx : qtok_end_idx + 1],
)
break


def expand_noun(tok: Token) -> list[Token]:
Expand All @@ -305,7 +310,6 @@ def expand_noun(tok: Token) -> list[Token]:
child
for tc in tok_and_conjuncts
for child in tc.children
# TODO: why doesn't compound import from spacy.symbols?
if child.dep_ == "compound"
]
return tok_and_conjuncts + compounds
Expand All @@ -317,3 +321,107 @@ def expand_verb(tok: Token) -> list[Token]:
child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS
]
return [tok] + verb_modifiers


def windower(quote: Span, method: Literal["overlap", "linebreaks"]) -> Iterable[Span]:
    """
    Select the sentences of ``quote.doc`` in which to look for quote attribution.

    Args:
        quote: Quote span to be attributed.
        method: How the sentence range is determined.

            - "overlap": sentences for which ``quote.start`` or ``quote.end`` lies
              strictly between the sentence's ``start`` and ``end``.
            - "linebreaks": sentences whose ``start``..``end`` range (inclusive)
              contains ``quote.start`` or ``quote.end``, widened by one sentence on
              each side, then trimmed so that no returned sentence ends after the
              last qualifying linebreak position (see the inline comments).
            - any other value: the widened window, without linebreak trimming.

    Returns:
        List of sentence spans.
    """
    doc = quote.doc
    q_start, q_end = quote.start, quote.end

    if method == "overlap":
        overlapping = []
        for sent in doc.sents:
            if sent.start < q_start < sent.end or sent.start < q_end < sent.end:
                overlapping.append(sent)
        return overlapping

    all_sents = list(doc.sents)
    touching = [
        pos
        for pos, sent in enumerate(all_sents)
        if sent.start <= q_start <= sent.end or sent.start <= q_end <= sent.end
    ]
    # widen by one sentence on each side, clamping at the start of the doc
    first_pos = max(touching[0] - 1, 0)
    stop_pos = touching[-1] + 2
    window = all_sents[first_pos:stop_pos]

    if method == "linebreaks":
        # candidate limits: doc start, every token whose text begins with a
        # newline, and the index of the doc's last token
        newline_idxs = [tok.i for tok in doc if re.match(r"\n", tok.text)]
        candidates = [0, *newline_idxs, doc[-1].i]
        window_start = window[0].start
        limits = [lb for lb in candidates if window_start < lb <= q_end + 1]
        if limits:
            cutoff = max(limits)
            return [sent for sent in window if sent.end <= cutoff]

    return window


def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool = True) -> str:
    """
    Sort out some common issues that trip up the quote detector.

    Works best one paragraph at a time -- use prep_document_for_quote_detection
    for a whole document.

    - replaces two consecutive apostrophes with a double quote
    - if enabled, appends an "x" to a trailing ``s'`` that is followed by
      whitespace (plural possessives), because the hanging apostrophe can
      trigger quote detection
    - pads double-quote characters that have non-whitespace on both sides with a
      space, and normalizes them to a plain ``"``
    - appends a double quote to text that starts with a quote character but has an
      even number of double-quote characters after it (paragraphs continuing
      a quote traditionally don't end with a quotation mark)

    Args:
        t: Text to be prepped, preferably one paragraph.
        fix_plural_possessives: Whether to apply the plural-possessive fix.

    Returns:
        Text prepped for quote detection, stripped of leading/trailing
        whitespace; an empty string if ``t`` is empty.
    """
    if not t:
        # nothing to prep; return a str (not None) to honor the annotation
        return ""

    p = t.replace("''", '"')
    if fix_plural_possessives:
        p = re.sub(r"(.{3,8}s\')(\s)", r"\1x\2", p)

    match = re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, p)
    while match:
        quotes_before = re.findall(
            constants.ANY_DOUBLE_QUOTE_REGEX, p[: match.start()]
        )
        # an odd number of preceding quotes means this one closes a quotation,
        # so the space goes after it; otherwise it opens one and the space goes before
        replacer = '" ' if len(quotes_before) % 2 != 0 else ' "'
        p = p[: match.start()] + replacer + p[match.end() :]
        match = re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, p)

    if (
        # text wrapped in plain apostrophes is left alone
        not (p[0] == "'" and p[-1] == "'")
        and p[0] in constants.ALL_QUOTES
        and len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, p[1:])) % 2 == 0
    ):
        p += '"'
    return p.strip()


def prep_document_for_quote_detection(t: str, para_char: str = "\n") -> str:
    """
    Prep a whole document for quote detection, one paragraph at a time.

    The text is split into paragraphs on ``para_char``, empty paragraphs are
    dropped, each remaining paragraph is run through
    prep_text_for_quote_detection, and the results are reassembled with
    ``para_char``.

    Args:
        t: Document to prep for quote detection.
        para_char: Paragraph boundary in ``t``.

    Returns:
        Document prepped for quote detection.
    """
    prepped_paragraphs = []
    for paragraph in t.split(para_char):
        if paragraph:
            prepped_paragraphs.append(prep_text_for_quote_detection(paragraph))
    return para_char.join(prepped_paragraphs)
Loading