-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilter.py
54 lines (42 loc) · 1.59 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
from abc import ABC, abstractmethod
import ahocorasick
from pandas import DataFrame
# interface (abstract class)
class TextFilter(ABC):
    """Interface for detectors that flag text containing unwanted words.

    Subclasses implement ``prepare`` and ``is_unhealthy``; ``filter`` is
    shared and counts the healthy and unhealthy rows of a DataFrame.
    """

    @abstractmethod
    def prepare(self, bad_words: list[str]) -> None:
        """Load ``bad_words`` into the detector.

        Call this once before calling ``filter`` or ``is_unhealthy``.
        """

    @abstractmethod
    def is_unhealthy(self, field: str) -> bool:
        """Return True when ``field`` should be flagged as unhealthy."""

    def filter(self, chunk: DataFrame) -> tuple[int, int]:
        """Count the healthy and unhealthy rows of ``chunk``.

        A row is unhealthy when ``is_unhealthy`` returns True for the string
        form of at least one of its cells. Missing cells (None, NaN, NaT,
        pandas NA) are skipped: they hold no text, and their string forms
        ("None", "nan", ...) must not be matched against the bad words.

        Returns:
            A ``(healthy_rows, unhealthy_rows)`` tuple.
        """
        rows = chunk.itertuples(index=False, name=None)
        missing_flags = chunk.isna().itertuples(index=False, name=None)

        unhealthy_rows = 0
        for row, row_missing in zip(rows, missing_flags):
            # any() stops at the first offending cell of the row.
            if any(
                not is_missing and self.is_unhealthy(str(field))
                for field, is_missing in zip(row, row_missing)
            ):
                unhealthy_rows += 1

        # Derive the healthy count from len(chunk) so that frames without
        # columns (which yield no tuples) still report every row as healthy.
        return len(chunk) - unhealthy_rows, unhealthy_rows
class AhoCorasickFilter(TextFilter):
    """Case-insensitive bad-word detector backed by an Aho-Corasick automaton."""

    def prepare(self, bad_words: list[str]) -> None:
        """Build the automaton from the lower-cased ``bad_words``.

        Must be called before ``is_unhealthy``. Empty strings are skipped
        because they carry no pattern to search for.
        """
        self.automaton = ahocorasick.Automaton()
        for word in bad_words:
            if not word:
                continue
            lowered = word.lower()
            self.automaton.add_word(lowered, lowered)
        self.automaton.make_automaton()

    def is_unhealthy(self, field: str) -> bool:
        """Return True when ``field`` contains any prepared bad word.

        Matching is case-insensitive: the words were lower-cased in
        ``prepare`` and ``field`` is lower-cased here.
        """
        # An automaton without words has nothing to match; answer directly
        # rather than searching it (pyahocorasick is expected to reject a
        # search on an automaton that holds no words).
        if len(self.automaton) == 0:
            return False
        # Only existence matters, so stop at the first match instead of
        # collecting every match into a list.
        first_match = next(self.automaton.iter(field.lower()), None)
        return first_match is not None
class RegexFilter(TextFilter):
    """Case-insensitive bad-word detector backed by one compiled regex."""

    def prepare(self, bad_words: list[str]) -> None:
        """Compile ``bad_words`` into a single case-insensitive alternation.

        Every word is escaped, so it is matched literally. Empty strings are
        dropped: an empty alternative matches at every position and would
        flag every field as unhealthy.
        """
        literals = [re.escape(word) for word in bad_words if word]
        # With nothing to search for, "|".join would give the empty pattern,
        # which matches any input. "(?!)" is an empty negative lookahead that
        # can never match, so no field is flagged.
        source = "|".join(literals) if literals else "(?!)"
        self.pattern = re.compile(source, flags=re.IGNORECASE)

    def is_unhealthy(self, field: str) -> bool:
        """Return True when ``field`` contains any prepared bad word."""
        return self.pattern.search(field) is not None