Skip to content

Commit

Permalink
Use more conventional wildcards for pattern matching. (#16)
Browse files Browse the repository at this point in the history
The usual ? and * wildcards are more intuitive. Also, there's no reason to
expose the user to the fact that we're using SQLite under the hood.

We rename the 'partial' property to 'is_pattern', which is more intuitive. The
same option is also used for pattern matching on paths for consistency.
  • Loading branch information
LTLA authored Nov 5, 2024
1 parent c8ddf96 commit 3c339b9
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 69 deletions.
18 changes: 11 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,9 @@ The nature of the search depends on the value of `type`:
Matches to tokens are only considered within the named property.
Properties of nested objects can be specified via `.`-delimited names, e.g., `authors.first`.
If `field` is not specified, matches are not restricted to any single property within a file.
- (optional) `partial`, a boolean indicating whether to perform a partial match.
If `true`, any SQL wildcards (`%` and `_`) in `text` will not be discarded during tokenization.
Wildcard-containing tokens are then used for pattern matching to metadata-derived tokens.
- (optional) `is_pattern`, a boolean indicating whether `text` is a wildcard-containing pattern.
The currently supported wildcards are `*`, which matches any number of characters (including none); and `?`, which matches exactly one character.
If `true`, wildcards will be preserved by tokenization and used for pattern matching to metadata-derived tokens.
Defaults to `false`.
- For `"user"`, SewerRat searches on the user names of the file owners.
The search clause should contain the `user` property, a string which contains the user name.
Expand All @@ -217,9 +217,8 @@ The nature of the search depends on the value of `type`:
Defaults to `false`.
- (optional) `is_suffix`, a boolean indicating whether to search for absolute paths that end with `path`.
Defaults to `false`.
- (optional) `escape`, a string containing a single escape character for SQLite pattern matching.
If provided, `path` is used verbatim as a SQLite pattern to be matched to the absolute path.
Any existing wildcards in `path` (or escapes thereof) are respected.
- (optional) `is_pattern`, a boolean indicating whether `path` is a wildcard-containing pattern; see the equivalent field for `text`.
Defaults to `false`.
- For `"time"`, SewerRat searches on the latest modification time of each file.
The search clause should contain the following additional properties:
- `time`, an integer containing the Unix time.
Expand Down Expand Up @@ -267,7 +266,12 @@ publication.author.first_name: Aaron
Note that this scoping-by-field does not extend to the `AND`, `OR` and `NOT` keywords,
e.g., `title:foo OR bar` will not limit the search for `bar` to the `title` field.

If a `%` wildcard is present in a search term, its local search clause is set to perform a partial search.
If a `*` or `?` wildcard is present in a search term, pattern matching will be performed to the metadata-derived tokens.
This only applies to the search clause immediately containing the term, e.g., `foo*` and `bar` will be used for pattern matching but `whee` and `stuff` will not.

```
(foo* bar) AND (whee stuff)
```

The human-friendly mode can be enabled by setting the `translate=true` query parameter in the request to the `/query` endpoint.
The structure of the request body is unchanged except that any `text` field is assumed to contain a search string and will be translated into the relevant search clause.
Expand Down
16 changes: 12 additions & 4 deletions database_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1245,13 +1245,21 @@ func TestQueryTokens(t *testing.T) {
})

t.Run("partial test", func(t *testing.T) {
res, err := queryTokens(dbconn, &searchClause{ Type: "text", Text: "%ar%", Partial: true }, nil, 0)
res, err := queryTokens(dbconn, &searchClause{ Type: "text", Text: "%ar%", IsPattern: true }, nil, 0)
if err != nil {
t.Fatalf(err.Error())
}
if !equalPathArrays(extractSortedPaths(res), []string{ "metadata.json", "stuff/metadata.json", "stuff/other.json" }, to_add) {
t.Fatalf("search results are not as expected %v", res)
}

res, err = queryTokens(dbconn, &searchClause{ Type: "text", Text: "l_mb", IsPattern: true }, nil, 0)
if err != nil {
t.Fatalf(err.Error())
}
if !equalPathArrays(extractSortedPaths(res), []string{ "metadata.json", "stuff/other.json" }, to_add) {
t.Fatalf("search results are not as expected %v", res)
}
})

t.Run("search on numbers", func(t *testing.T) {
Expand Down Expand Up @@ -1337,7 +1345,7 @@ func TestQueryTokens(t *testing.T) {
dbconn,
&searchClause{
Type: "not",
Child: &searchClause{ Type: "text", Text: "%ar%", Partial: true },
Child: &searchClause{ Type: "text", Text: "%ar%", IsPattern: true },
},
nil,
0,
Expand Down Expand Up @@ -1464,8 +1472,8 @@ func TestQueryTokens(t *testing.T) {
&searchClause{
Type: "or",
Children: []*searchClause{
&searchClause{ Type: "text", Text: "aar%", Partial: true },
&searchClause{ Type: "text", Text: "ak%", Partial: true },
&searchClause{ Type: "text", Text: "aar%", IsPattern: true },
&searchClause{ Type: "text", Text: "ak%", IsPattern: true },
},
},
nil,
Expand Down
21 changes: 21 additions & 0 deletions handlers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,27 @@ func TestQueryHandler(t *testing.T) {
}
})

t.Run("wildcards", func (t *testing.T) {
req, err := http.NewRequest("POST", "/query?translate=true", strings.NewReader(`{ "type": "text", "text": "l?mb OR chick*" }`))
if err != nil {
t.Fatal(err)
}

rr := httptest.NewRecorder()
handler.ServeHTTP(rr, req)
if rr.Code != http.StatusOK {
t.Fatalf("should have succeeded")
}

all_paths, scroll := validateSearchResults(rr.Body)
if scroll != "" {
t.Fatalf("unexpected scroll %v", scroll)
}
if !equalPathArrays(all_paths, []string{ "metadata.json", "stuff/other.json" }, to_add) {
t.Fatalf("unexpected paths %v", all_paths)
}
})

t.Run("scroll", func (t *testing.T) {
dummy_query := `{ "type": "text", "text": " " }`

Expand Down
61 changes: 33 additions & 28 deletions query.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@ type searchClause struct {
Type string `json:"type"`

// Only relevant for type = path.
// - Before sanitization: if Escape is empty, Path is assumed to contain a substring of the path, to be extended at the front/back depending on IsPrefix and IsSuffix.
// If Escape is not empty, Path is assumed to be a wildcard-containing pattern.
// - After sanitization: Path is a wildcard-containing pattern.
// - Before sanitization: if IsPattern = false, Path is assumed to contain a substring of the path, to be extended at the front/back depending on IsPrefix and IsSuffix.
// If IsPattern = true, Path is assumed to be a pattern with the non-SQLite wildcards.
// Escape is obviously empty.
// - After sanitization: Path is a SQLite-wildcard-containing pattern.
// Escape may or may not be an empty string, depending on whether Path needed escaping of wildcard characters.
// IsPrefix and IsSuffix are no longer used.
// IsPrefix, IsPattern and IsSuffix are no longer used.
Path string `json:"path"`
Escape string `json:"escape"`
Escape string `json:"-"`
IsPrefix bool `json:"is_prefix"`
IsSuffix bool `json:"is_suffix"`

Expand All @@ -30,10 +31,12 @@ type searchClause struct {

// Only relevant for text.
// - Before sanitization: Text may consist of multiple tokens, effectively combined with an AND statement.
// - After sanitization: Text will consist of only one token (possibly with wildcards if Partial = true, otherwise there will be no wildcards).
// Each term may have conventional (non-SQLite) wildcards, i.e., ?, *.
// - After sanitization: Text will consist of only one token.
// The token may contain SQLite wildcards if IsPattern = true, otherwise there will be no wildcards.
Text string `json:"text"`
Field string `json:"field"`
Partial bool `json:"partial"`
IsPattern bool `json:"is_pattern"`

// Only relevant for type = and/or.
// - Before sanitization: any child may be an AND (for type = and) or OR (for type = or) clause, and there may be any number of children.
Expand Down Expand Up @@ -145,7 +148,7 @@ func sanitizeQuery(original *searchClause, deftok, wildtok *unicodeTokenizer) (*
if original.Type == "text" {
var tokens []string
var err error
if original.Partial {
if original.IsPattern {
tokens, err = wildtok.Tokenize(original.Text)
} else {
tokens, err = deftok.Tokenize(original.Text)
Expand All @@ -159,7 +162,7 @@ func sanitizeQuery(original *searchClause, deftok, wildtok *unicodeTokenizer) (*

replacements := []*searchClause{}
for _, tok := range tokens {
replacements = append(replacements, &searchClause{ Type: "text", Partial: original.Partial, Field: original.Field, Text: tok })
replacements = append(replacements, &searchClause{ Type: "text", IsPattern: original.IsPattern, Field: original.Field, Text: tok })
}
if len(replacements) == 1 {
return replacements[0], nil
Expand All @@ -172,24 +175,26 @@ func sanitizeQuery(original *searchClause, deftok, wildtok *unicodeTokenizer) (*
}

if original.Type == "path" {
if original.Escape != "" {
if len(original.Escape) != 1 {
return nil, fmt.Errorf("'escape' must be a single character (got %s)", original.Escape)
}
return &searchClause { Type: "path", Path: original.Path, Escape: original.Escape }, nil
} else {
pattern, escape, err := escapeWildcards(original.Path)
if err != nil {
return nil, fmt.Errorf("failed to escape wildcards for path %q; %w", original.Path, err)
}
if !original.IsPrefix {
pattern = "%" + pattern
}
if !original.IsSuffix {
pattern += "%"
}
return &searchClause { Type: "path", Path: pattern, Escape: escape }, nil
pattern, escape, err := escapeWildcards(original.Path)
if err != nil {
return nil, fmt.Errorf("failed to escape wildcards for path %q; %w", original.Path, err)
}

if original.IsPattern {
rep := strings.NewReplacer(
"?", "_",
"*", "%",
)
pattern = rep.Replace(pattern)
}
if !original.IsPrefix && !strings.HasPrefix(pattern, "%") {
pattern = "%" + pattern
}
if !original.IsSuffix && !strings.HasSuffix(pattern, "%") {
pattern += "%"
}

return &searchClause { Type: "path", Path: pattern, Escape: escape }, nil
}

return nil, fmt.Errorf("unknown search type %q", original.Type)
Expand All @@ -208,7 +213,7 @@ func assembleFilter(query *searchClause) (string, []interface{}) {
}

filter += " tokens.token"
if query.Partial {
if query.IsPattern {
filter += " LIKE"
} else {
filter += " ="
Expand Down Expand Up @@ -279,7 +284,7 @@ func assembleFilter(query *searchClause) (string, []interface{}) {

for _, tchild := range text {
current := "tokens.token"
if tchild.Partial {
if tchild.IsPattern {
current += " LIKE"
} else {
current += " ="
Expand Down
43 changes: 21 additions & 22 deletions query_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,17 +254,26 @@ func TestSanitizeQuery(t *testing.T) {
}
}

{
t.Run("wildcards", func(t *testing.T) {
// Wildcards are respected.
query := &searchClause { Type: "text", Text: " Harvest%", Partial: true }
query := &searchClause { Type: "text", Text: " Harvest*", IsPattern: true }
san, err := sanitizeQuery(query, deftok, wildtok)
if err != nil {
t.Fatalf(err.Error())
}
if san == nil || san.Type != "text" || san.Text != "harvest%" || !san.Partial {
if san == nil || san.Type != "text" || san.Text != "harvest%" || !san.IsPattern {
t.Fatalf("unexpected result from sanitization %v", san)
}
}

query = &searchClause { Type: "text", Text: "mo?n ", IsPattern: true }
san, err = sanitizeQuery(query, deftok, wildtok)
if err != nil {
t.Fatalf(err.Error())
}
if san == nil || san.Type != "text" || san.Text != "mo_n" || !san.IsPattern {
t.Fatalf("unexpected result from sanitization %v", san)
}
})

{
// Fields are respected.
Expand Down Expand Up @@ -313,37 +322,27 @@ func TestSanitizeQuery(t *testing.T) {
}
}

// Path works with wildcard tokens.
{
query := &searchClause { Type: "path", Path: "foo%bar" }
t.Run("escaped wildcards", func(t *testing.T) {
query := &searchClause { Type: "path", Path: "foo%ba_r" }
san, err := sanitizeQuery(query, deftok, wildtok)
if err != nil {
t.Fatalf(err.Error())
}
if san == nil || san.Type != "path" || san.Path != "%foo\\%bar%" || san.Escape != "\\" {
if san == nil || san.Type != "path" || san.Path != "%foo\\%ba\\_r%" || san.Escape != "\\" {
t.Fatalf("unexpected result from sanitization %v", san)
}
}
})

// If Escape is supplied, we run verbatim.
{
query := &searchClause { Type: "path", Path: "%foo%bar", Escape: "~" }
t.Run("translate wildcards", func(t *testing.T) {
query := &searchClause { Type: "path", Path: "/f*oo_b?r", IsPattern: true }
san, err := sanitizeQuery(query, deftok, wildtok)
if err != nil {
t.Fatalf(err.Error())
}
if san == nil || san.Type != "path" || san.Path != "%foo%bar" || san.Escape != "~" {
if san == nil || san.Type != "path" || san.Path != "%/f%oo\\_b_r%" || san.Escape != "\\" {
t.Fatalf("unexpected result from sanitization %v", san)
}
}

{
query := &searchClause { Type: "path", Path: "foo%bar", Escape: "abcd" }
_, err := sanitizeQuery(query, deftok, wildtok)
if err == nil || !strings.Contains(err.Error(), "single character") {
t.Fatal("expected sanitization failure")
}
}
})
})

t.Run("other", func(t *testing.T) {
Expand Down
17 changes: 16 additions & 1 deletion tokenize.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,33 @@ import (
// unicodeTokenizer splits free text into normalized tokens for indexing and
// querying. It strips diacritics, splits on non-token characters, and
// (optionally) rewrites conventional wildcards into their SQLite equivalents.
type unicodeTokenizer struct {
	// Stripper normalizes input text; as constructed in newUnicodeTokenizer,
	// it decomposes to NFD, removes combining marks (diacritics), and
	// recomposes to NFC.
	Stripper transform.Transformer
	// Splitter matches runs of characters that separate tokens; the input is
	// split on this pattern during tokenization.
	Splitter *regexp.Regexp
	// Converter, if non-nil, rewrites the conventional wildcards '*' and '?'
	// in each token into the SQLite LIKE wildcards '%' and '_'. It is only
	// set for wildcard-aware tokenizers (see newUnicodeTokenizer).
	Converter *strings.Replacer
}

// newUnicodeTokenizer constructs a tokenizer that strips diacritics and
// splits text on characters outside of letters, numbers, private-use code
// points and dashes.
//
// If allow_wildcards is true, the conventional wildcards '*' and '?' are
// additionally treated as token characters, and tokens will have them
// translated into the SQLite LIKE wildcards '%' and '_' respectively.
// An error is returned if the splitting pattern fails to compile.
func newUnicodeTokenizer(allow_wildcards bool) (*unicodeTokenizer, error) {
	extra := ""
	if allow_wildcards {
		extra = "*?"
	}

	splitter, err := regexp.Compile("[^\\p{L}\\p{N}\\p{Co}" + extra + "-]+")
	if err != nil {
		return nil, fmt.Errorf("failed to compile regex; %w", err)
	}

	// Only wildcard-aware tokenizers need a converter; for plain
	// tokenization, no wildcard translation is performed at all.
	var converter *strings.Replacer
	if allow_wildcards {
		// Convert the usual wildcards to SQLite wildcards.
		converter = strings.NewReplacer(
			"*", "%",
			"?", "_",
		)
	}

	return &unicodeTokenizer{
		Stripper:  transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC),
		Splitter:  splitter,
		Converter: converter,
	}, nil
}

Expand All @@ -43,8 +54,12 @@ func (u *unicodeTokenizer) Tokenize(x string) ([]string, error) {

final := []string{}
present := map[string]bool{}

for _, t := range output {
if len(t) > 0 {
if u.Converter != nil {
t = u.Converter.Replace(t)
}
if _, ok := present[t]; !ok {
final = append(final, t)
present[t] = true
Expand Down
2 changes: 1 addition & 1 deletion tokenize_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func TestUnicodeTokenizer(t *testing.T) {
}

{
out, err := tok.Tokenize(" Aar%\thad a little\n l_mb ")
out, err := tok.Tokenize(" Aar*\thad a little\n l?mb ")
if err != nil {
t.Fatalf(err.Error())
}
Expand Down
2 changes: 1 addition & 1 deletion translate.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ func translateTextClause(status *translationStatus, at int) error {
converted := []*searchClause{}
for _, x := range status.Words {
word := string(x)
converted = append(converted, &searchClause{ Type: "text", Text: word, Field: field, Partial: strings.Index(word, "%") >= 0 })
converted = append(converted, &searchClause{ Type: "text", Text: word, Field: field, IsPattern: strings.ContainsAny(word, "*?") })
}

var new_component *searchClause
Expand Down
11 changes: 6 additions & 5 deletions translate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,22 @@ func TestTranslateTextQuerySimple(t *testing.T) {
}

// Recognizes partial hits.
out, err = translateTextQuery("foo%")
out, err = translateTextQuery("foo*")
if err != nil {
t.Fatal(err)
}
if out.Type != "text" || out.Text != "foo%" || !out.Partial {
if out.Type != "text" || out.Text != "foo*" || !out.IsPattern {
t.Fatal("unexpected text query")
}

out, err = translateTextQuery("foo% bar")
out, err = translateTextQuery("foo* ?bar whee")
if err != nil {
t.Fatal(err)
}
if out.Type != "and" ||
out.Children[0].Text != "foo%" || !out.Children[0].Partial ||
out.Children[1].Text != "bar" || out.Children[1].Partial {
out.Children[0].Text != "foo*" || !out.Children[0].IsPattern ||
out.Children[1].Text != "?bar" || !out.Children[1].IsPattern ||
out.Children[2].Text != "whee" || out.Children[2].IsPattern {
t.Fatal("unexpected text query")
}

Expand Down

0 comments on commit 3c339b9

Please sign in to comment.