-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatching.py
94 lines (80 loc) · 3.35 KB
/
matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import re
import pandas as pd
import ast
from fuzzywuzzy import fuzz
# Load the list of food names, one entry per line of the text file
with open("./data/foodList.txt") as f:
    foodList = f.read().splitlines()
food_nutrient_large = pd.read_csv('./data/food_nutrients_dict.csv')
# Reads categorized food with 16,000 rows
food_categ = pd.read_csv('./data/food_categorized_nurtrients_w_name.csv')
# Clean up foodList: drop the entries below (presumably too generic to be
# useful as match targets -- confirm with the data owner). Order is preserved.
_excluded_food_terms = {
    'baby', 'producer', 'red', '85% lean', 'baked', 'leg',
    'greater than 3% juice', 'family style', 'polish', 'greek',
    'on the border', 'tlc', 'low calorie', 'milk producer', 'producer milk',
    'green', 'grade',
}
foodList = [item for item in foodList if item not in _excluded_food_terms]
# Creates a series of food_names with a trailing comma, so that names
# without any comma still match the extraction pattern below
food_comma = food_categ['food_name'] + ','
# Extracts the food's generic name: everything before the first comma
food_categ['generic'] = food_comma.str.extract(r'^([^,]*),')[0]
# Get rid of commas, colons, and other special characters, then collapse
# runs of spaces. regex=True is passed explicitly: since pandas 2.0 the
# default is regex=False, which would treat both patterns as literal text
# and leave the names unchanged.
food_clean_name = (
    food_categ['food_name']
    .str.replace(r'[^a-zA-Z0-9 ]', "", regex=True)
    .str.replace(r' +', ' ', regex=True)
)
# Compiles a list of foodnames with 200k data and with 16k data respectively
foodlist_large = food_nutrient_large['name'].tolist()
foodlist_categ = food_clean_name.tolist()
foodlist_generic = food_categ['generic'].tolist()
# Matches lines that begin with one or more characters other than lowercase
# a-z, followed by a digit, a period and a digit (a decimal number)
regexp = re.compile(r'^[^a-z]+\d\.\d')
# Captures a run of non-period characters that is followed by a space, one or
# more digits and a period. Written as a raw string so that \. and \d are not
# parsed as (invalid) string escapes; the compiled pattern is unchanged.
findFoodName = re.compile(r"([^\.]+) \d+\.")
all_scores = []
# Finds the best match of each food item on the receipt from the database.
# Outputs a list of [receipt text, food name, generic name, nutrients] lists.
def match_receipt(inp):
    """Fuzzy-match receipt lines against the categorized food names.

    A line is considered only when both module-level patterns, ``regexp``
    and ``findFoodName``, match it. The text matched by ``findFoodName`` is
    scored against every entry of ``foodlist_categ`` with
    ``fuzz.token_set_ratio``; the first entry reaching the highest score is
    selected, and it is kept only if that score is greater than 50.

    Args:
        inp: Iterable of receipt text lines.

    Returns:
        A list with one ``[receipt_text, food_name, generic_name, nutrients]``
        list per accepted line, where ``nutrients`` is the result of
        ``ast.literal_eval`` on the row's 'nutrients' cell of ``food_categ``.
    """
    matches = []
    for line in inp:
        if not regexp.search(line):
            continue
        found = findFoodName.search(line)
        if not found:
            continue
        # NOTE(review): group(0) is the whole match, i.e. it also contains the
        # trailing " <digits>." part, not only the captured name -- confirm
        # this is intended.
        words = found.group(0)

        # Linear scan; strict '>' keeps the earliest entry on ties.
        top_score, top_idx, top_name = 0, 0, ""
        for idx, candidate in enumerate(foodlist_categ):
            score = fuzz.token_set_ratio(words, candidate)
            if score > top_score:
                top_score, top_idx, top_name = score, idx, candidate

        if top_score <= 50:
            continue
        raw_nutrients = food_categ.loc[top_idx, 'nutrients']
        generic_name = food_categ.loc[top_idx, 'generic']
        matches.append(
            [words, top_name, generic_name, ast.literal_eval(raw_nutrients)]
        )
    return matches
def match_receipt_advanced(inp):
    """Match receipt lines to categorized foods in two fuzzy-matching stages.

    Stage 1: for every line matched by both ``regexp`` and ``findFoodName``,
    the matched text is scored against each entry of ``foodList`` with
    ``fuzz.token_set_ratio``. The first highest-scoring entry is recorded,
    keyed by that entry, when its score is greater than 65. A later line that
    maps to the same entry overwrites the earlier one.

    Stage 2: each recorded entry is scored against every name in
    ``foodlist_categ`` using the mean of two ``token_set_ratio`` scores (one
    for the ``foodList`` entry, one for the original receipt text). The first
    highest-scoring name is kept when the combined score is greater than 60.

    Args:
        inp: Iterable of receipt text lines.

    Returns:
        A list of ``[receipt_text, foodList_entry, food_name, nutrients]``
        lists, where ``nutrients`` is the result of ``ast.literal_eval`` on
        the row's 'nutrients' cell of ``food_categ``.
    """
    # Stage 1: foodList entry -> [original receipt text, score]
    cleaned_to_orig = {}
    for line in inp:
        if not regexp.search(line):
            continue
        found = findFoodName.search(line)
        if not found:
            continue
        receipt_text = found.group(0)

        top_score, top_name = 0, ""
        for candidate in foodList:
            score = fuzz.token_set_ratio(receipt_text, candidate)
            if score > top_score:
                top_score, top_name = score, candidate

        if top_score > 65:
            cleaned_to_orig[top_name] = [receipt_text, top_score]

    # Stage 2: look up each recorded entry in the categorized food names
    results = []
    for clean, (receipt_text, _stage1_score) in cleaned_to_orig.items():
        top_score, top_idx, top_name = 0, 0, ""
        for idx, candidate in enumerate(foodlist_categ):
            # Equal weighting of the cleaned entry and the raw receipt text
            score = (
                fuzz.token_set_ratio(clean, candidate) * 0.5
                + fuzz.token_set_ratio(receipt_text, candidate) * 0.5
            )
            if score > top_score:
                top_score, top_idx, top_name = score, idx, candidate

        if top_score > 60:
            raw_nutrients = food_categ.loc[top_idx, 'nutrients']
            nutrients = ast.literal_eval(raw_nutrients)
            results.append([receipt_text, clean, top_name, nutrients])
    return results