-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add local spam filter * optimize, don't tokenize spam samples on each check * simplify spam params
- Loading branch information
Showing
5 changed files
with
226 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
package bot | ||
|
||
import ( | ||
"bufio" | ||
"fmt" | ||
"io" | ||
"log" | ||
"math" | ||
"strings" | ||
) | ||
|
||
// SpamLocalFilter bot, checks if user is a spammer using internal matching | ||
type SpamLocalFilter struct { | ||
dry bool | ||
superUser SuperUser | ||
threshold float64 | ||
|
||
enabled bool | ||
tokenizedSpam []map[string]int | ||
approvedUsers map[int64]bool | ||
} | ||
|
||
// NewSpamLocalFilter makes a spam detecting bot | ||
func NewSpamLocalFilter(spamSamples io.Reader, threshold float64, superUser SuperUser, dry bool) *SpamLocalFilter { | ||
log.Printf("[INFO] Spam bot (local), threshold=%0.2f", threshold) | ||
res := &SpamLocalFilter{dry: dry, approvedUsers: map[int64]bool{}, superUser: superUser, threshold: threshold} | ||
|
||
scanner := bufio.NewScanner(spamSamples) | ||
for scanner.Scan() { | ||
tokenizedSpam := res.tokenize(scanner.Text()) | ||
res.tokenizedSpam = append(res.tokenizedSpam, tokenizedSpam) | ||
} | ||
if err := scanner.Err(); err != nil { | ||
log.Printf("[WARN] failed to read spam samples, error=%v", err) | ||
res.enabled = false | ||
} else { | ||
res.enabled = true | ||
} | ||
return res | ||
} | ||
|
||
// OnMessage checks if user already approved and if not checks if user is a spammer | ||
func (s *SpamLocalFilter) OnMessage(msg Message) (response Response) { | ||
if !s.enabled { | ||
return Response{} | ||
} | ||
|
||
if s.approvedUsers[msg.From.ID] { | ||
return Response{} | ||
} | ||
|
||
if s.superUser.IsSuper(msg.From.Username) { | ||
return Response{} // don't check super users for spam | ||
} | ||
|
||
if !s.isSpam(msg.Text) { | ||
log.Printf("[INFO] user %s is not a spammer, added to aproved", msg.From.Username) | ||
s.approvedUsers[msg.From.ID] = true | ||
return Response{} // not a spam | ||
} | ||
|
||
log.Printf("[INFO] user %s detected as spammer, msg: %q", msg.From.Username, msg.Text) | ||
if s.dry { | ||
return Response{ | ||
Text: fmt.Sprintf("this is spam from %q, but I'm in dry mode, so I'll do nothing yet", msg.From.Username), | ||
Send: true, ReplyTo: msg.ID, | ||
} | ||
} | ||
return Response{Text: "this is spam! go to ban, " + msg.From.DisplayName, Send: true, ReplyTo: msg.ID, BanInterval: permanentBanDuration, DeleteReplyTo: true} | ||
} | ||
|
||
// Help returns help message | ||
func (s *SpamLocalFilter) Help() string { return "" } | ||
|
||
// ReactOn keys | ||
func (s *SpamLocalFilter) ReactOn() []string { return []string{} } | ||
|
||
// isSpam checks if a given message is similar to any of the known bad messages. | ||
func (s *SpamLocalFilter) isSpam(message string) bool { | ||
tokenizedMessage := s.tokenize(message) | ||
maxSimilarity := 0.0 | ||
for _, spam := range s.tokenizedSpam { | ||
similarity := s.cosineSimilarity(tokenizedMessage, spam) | ||
if similarity > maxSimilarity { | ||
maxSimilarity = similarity | ||
} | ||
if similarity >= s.threshold { | ||
return true | ||
} | ||
} | ||
log.Printf("[DEBUG] spam similarity: %0.2f", maxSimilarity) | ||
return false | ||
} | ||
|
||
// tokenize takes a string and returns a map where the keys are unique words (tokens) | ||
// and the values are the frequencies of those words in the string. | ||
func (s *SpamLocalFilter) tokenize(inp string) map[string]int { | ||
tokenFrequency := make(map[string]int) | ||
tokens := strings.Fields(inp) | ||
for _, token := range tokens { | ||
tokenFrequency[strings.ToLower(token)]++ | ||
} | ||
return tokenFrequency | ||
} | ||
|
||
// cosineSimilarity calculates the cosine similarity between two token frequency maps. | ||
func (s *SpamLocalFilter) cosineSimilarity(a, b map[string]int) float64 { | ||
if len(a) == 0 || len(b) == 0 { | ||
return 0.0 | ||
} | ||
|
||
dotProduct := 0 // sum of product of corresponding frequencies | ||
normA, normB := 0, 0 // square root of sum of squares of frequencies | ||
|
||
for key, val := range a { | ||
dotProduct += val * b[key] | ||
normA += val * val | ||
} | ||
for _, val := range b { | ||
normB += val * val | ||
} | ||
|
||
if normA == 0 || normB == 0 { | ||
return 0.0 | ||
} | ||
|
||
// cosine similarity formula | ||
return float64(dotProduct) / (math.Sqrt(float64(normA)) * math.Sqrt(float64(normB))) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
package bot | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
|
||
"github.com/radio-t/super-bot/app/bot/mocks" | ||
) | ||
|
||
func TestSpamLocalFilter_OnMessage(t *testing.T) { | ||
superUser := &mocks.SuperUser{IsSuperFunc: func(userName string) bool { | ||
if userName == "super" || userName == "admin" { | ||
return true | ||
} | ||
return false | ||
}} | ||
spamSamples := strings.NewReader("win free iPhone\nlottery prize") | ||
|
||
filter := NewSpamLocalFilter(spamSamples, 0.5, superUser, false) | ||
|
||
tests := []struct { | ||
msg Message | ||
expected Response | ||
}{ | ||
{ | ||
Message{From: User{ID: 1, Username: "john", DisplayName: "John"}, Text: "Hello, how are you?", ID: 1}, | ||
Response{}, | ||
}, | ||
{ | ||
Message{From: User{ID: 2, Username: "spammer", DisplayName: "Spammer"}, Text: "Win a free iPhone now!", ID: 2}, | ||
Response{Text: "this is spam! go to ban, Spammer", Send: true, ReplyTo: 2, BanInterval: permanentBanDuration, DeleteReplyTo: true}, | ||
}, | ||
{ | ||
Message{From: User{ID: 3, Username: "super", DisplayName: "SuperUser"}, Text: "Win a free iPhone now!", ID: 3}, | ||
Response{}, | ||
}, | ||
} | ||
|
||
for _, test := range tests { | ||
assert.Equal(t, test.expected, filter.OnMessage(test.msg)) | ||
} | ||
} | ||
|
||
func TestIsSpam(t *testing.T) { | ||
spamSamples := strings.NewReader("win free iPhone\nlottery prize") | ||
filter := NewSpamLocalFilter(spamSamples, 0.5, nil, false) // SuperUser set to nil for this test | ||
|
||
tests := []struct { | ||
name string | ||
message string | ||
threshold float64 | ||
expected bool | ||
}{ | ||
{"Not Spam", "Hello, how are you?", 0.5, false}, | ||
{"Exact Match", "Win a free iPhone now!", 0.5, true}, | ||
{"Similar Match", "You won a lottery prize!", 0.3, true}, | ||
{"High Threshold", "You won a lottery prize!", 0.9, false}, | ||
{"Partial Match", "win free", 0.9, false}, | ||
{"Low Threshold", "win free", 0.8, true}, | ||
} | ||
|
||
for _, test := range tests { | ||
t.Run(test.name, func(t *testing.T) { | ||
filter.threshold = test.threshold // Update threshold for each test case | ||
assert.Equal(t, test.expected, filter.isSpam(test.message)) | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters