if"translation"inenor"incorrect"inenorre.search(r"\btypo\b",en):continue# alternate translation, old translation, typo, etc
# Remove a parenthetical that mentions "name", so "X (surname)" becomes "X".
# The parentheses must be escaped to match literally: left unescaped they
# form a capture group, which turned "X (surname)" into "X)" and also cut
# text such as "first name (formal)" down to "first)".
# NOTE(review): the original remark says this is OK "if X is capitals only";
# no such check is made by this statement.
en = re.sub(r" \([^)]*name[^)]*\)", "", en)
if"("inen:continue# we don't want entries like "notice (on paper)" where the parens indicate it's a specific meaning
ifre.search("; [^A-Z]",en):continue# "here; in this region" probably shouldn't pull out "here", but ";" + abbreviation is probably OK, as is "Name; Name"
# Loop over the ';'-separated parts of en, each with surrounding whitespace
# stripped, rebinding en to each part in turn.  Any "(...)" group is removed
# before splitting.
# NOTE(review): the code on this line is left exactly as found (keyword
# spacing appears collapsed), because the loop body is not visible here.
forenin[een.strip()foreeninre.sub("[(][^)]*[)]","",en).split(';')]:# the parenthesis removal is relevant only if the earlier '"(" in en' skip is commented out
# Keep (runs of) tags: every maximal run of "<...>" tags in txt, together
# with the whitespace following each tag, is stored in e2c mapped to itself.
# The pattern contains no ".", so re.DOTALL does not change what it matches.
# TODO: might be better if we don't make them sentence objects
for tagMatch in re.finditer(r"(?:<[^>]*>\s*)+", txt, flags=re.DOTALL):
    tag = tagMatch.group()
    e2c[tag] = tag
# Replace every occurrence of each key in txt with the placeholder " {N} ",
# where N is the key's position in keyList.  Keys are applied one after
# another, in keyList order, each to the result of the previous replacement.
# TODO: this loop is slow: might want to get an annogen-generated annotator
# to do it (but there's the \b) or make an OR list like the annogen normaliser
# NOTE(review): because replacement is sequential, a later key consisting
# only of digits could match inside an earlier placeholder; confirm keyList
# cannot contain such keys.
for i, k in enumerate(keyList):
    placeholder = " {%d} " % i
    if k.startswith("<"):
        # Keys starting with "<" are replaced literally, irrespective of
        # word boundaries.
        txt = txt.replace(k, placeholder)
    else:
        # Don't match lower case if the key has upper case, as it might be a
        # name or abbreviation that in lower case will be a normal word and
        # not this entry (TODO there can be false positives at start of
        # sentences though); but do match title case if the key is lower case.
        if re.search("[A-Z]", k):
            caseFlags = 0
        else:
            caseFlags = re.IGNORECASE
        wholeWord = re.compile(r"\b" + re.escape(k) + r"\b", caseFlags)
        txt = wholeWord.sub(placeholder, txt)
# Split txt into sentences.  Each sentence starts at a character that is not
# a space, ".", "!" or "?", and runs (across newlines, because of re.DOTALL)
# up to the nearest end of text or ".", "!" or "?" that is followed by
# whitespace or the end of text.
# TODO: at one point didn't include a <p> (prob repr'd as {..}) at start of
# sentence, regex seems ok, did model drop it?
sentenceRegex = re.compile(r"[^ .!?].*?(?:$|[.!?])(?=$|\s+)", re.DOTALL)
sentences = sentenceRegex.findall(txt)