Skip to content

Commit

Permalink
Speed up name annotation (#31)
Browse files (browse the repository at this point in the history)
* Speed up name annotation

* lint
  • Loading branch information
thomasyu888 authored Jan 9, 2021
1 parent 4746a79 commit a36e7b9
Showing 1 changed file with 15 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ class Data:
def __init__(self):
    """Load the first-name and last-name dictionaries from CSV.

    Reads ``data/firstnames.csv`` (column ``firstname``) and
    ``data/lastnames.csv`` (column ``lastname``), both resolved relative
    to the current working directory, and stores three lists on the
    instance:

    * ``_firstnames`` -- lower-cased, de-duplicated first names
    * ``_lastnames``  -- lower-cased, de-duplicated last names
    * ``_names``      -- lower-cased, de-duplicated union of both, first
      names first, in order of first appearance
    """
    # All first names (1974 to 2019) from https://www.nrscotland.gov.uk
    firstnames_df = pd.read_csv("data/firstnames.csv")
    self._firstnames = firstnames_df['firstname'].str.lower().unique().tolist()  # noqa: E501

    # Top 1000 last names from census.gov (18-10-2020)
    # https://www.census.gov/topics/population/genealogy/data/2000_surnames.html
    lastnames_df = pd.read_csv("data/lastnames.csv")
    self._lastnames = lastnames_df['lastname'].str.lower().unique().tolist()  # noqa: E501

    # Combine all names. ``Series.append`` was removed in pandas 2.0, so
    # use ``pd.concat``, which yields the same ordering (first names, then
    # last names) on every pandas version.
    names = pd.concat(
        [firstnames_df['firstname'], lastnames_df['lastname']],
        ignore_index=True,
    )
    self._names = names.str.lower().unique().tolist()


# Shared Data instance: constructing it here reads the name CSVs once, when
# this statement runs; the annotation handler below then reuses its lists.
data = Data()
Expand All @@ -40,27 +41,18 @@ def create_text_person_name_annotations(note=None): # noqa: E501
annotation_request = TextPersonNameAnnotationRequest.from_dict(connexion.request.get_json()) # noqa: E501
note = annotation_request._note # noqa: E501
annotations = []
for name in data._firstnames:
matches = re.finditer(
r'\b({})\b'.format(name), note._text, re.IGNORECASE)
for match in matches:
annotations.append(TextPersonNameAnnotation(
start=match.start(),
length=len(match[0]),
text=match[0],
confidence=95
))
for name in data._lastnames:
matches = re.finditer(
r'\b({})\b'.format(name), note._text, re.IGNORECASE)
for match in matches:
annotations.append(TextPersonNameAnnotation(
start=match.start(),
length=len(match[0]),
text=match[0],
confidence=95
))

for name in data._names:
if name in note._text.lower():
matches = re.finditer(
r'\b({})\b'.format(name), note._text, re.IGNORECASE)
for match in matches:
annotations.append(TextPersonNameAnnotation(
start=match.start(),
length=len(match[0]),
text=match[0],
confidence=95
))
res = TextPersonNameAnnotations(annotations)
status = 200
except Exception as error:
Expand Down

0 comments on commit a36e7b9

Please sign in to comment.