Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for regex queries #188

Merged
merged 10 commits into from
Mar 29, 2024
19 changes: 18 additions & 1 deletion core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import cats.data.NonEmptyList
import pink.cozydev.lucille.{MultiQuery, Query, TermQuery}
import pink.cozydev.protosearch.internal.PositionalIter

import java.util.regex.PatternSyntaxException

sealed abstract class IndexSearcher {
def search(q: Query): Either[String, Set[Int]]

Expand Down Expand Up @@ -94,7 +96,7 @@ object IndexSearcher {
case q: Query.UnaryPlus => Left(s"Unsupported UnaryPlus in BooleanRetrieval: $q")
case q: Query.Proximity => Left(s"Unsupported Proximity in BooleanRetrieval: $q")
case q: Query.Fuzzy => Left(s"Unsupported Fuzzy in BooleanRetrieval: $q")
case q: Query.TermRegex => Left(s"Unsupported Regex in BooleanRetrieval: $q")
case q: Query.TermRegex => regexSearch(q)
case q: Query.MinimumMatch => Left(s"Unsupported MinimumMatch in BooleanRetrieval: $q")
}

Expand Down Expand Up @@ -124,6 +126,21 @@ object IndexSearcher {
case _ => Left("Unsupport TermRange error?")
}
}

/** Evaluate a regex query against the index's term dictionary.
  *
  * The pattern text in `q.str` is compiled first. A pattern that cannot be compiled is
  * reported as a `Left` rather than surfacing as an exception. Otherwise the result is the
  * union of `index.docsWithTerm` over every term returned by `termDict.termsForRegex`.
  *
  * @param q the regex query to evaluate
  * @return the collected document ids, or an error message for an invalid pattern
  */
private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = {
  // `try` is an expression, so the failure case becomes a value instead of an early `return`.
  // Only pattern compilation is guarded; the lookup below runs outside the `try`.
  val compiled: Either[String, scala.util.matching.Regex] =
    try Right(q.str.r)
    catch { case _: PatternSyntaxException => Left(s"Invalid regex query $q") }

  compiled.map { regex =>
    index.termDict
      .termsForRegex(regex)
      .flatMap(term => index.docsWithTerm(term))
      .toSet
  }
}
}

def intersectSets(sets: NonEmptyList[Set[Int]]): Set[Int] =
Expand Down
29 changes: 25 additions & 4 deletions core/src/main/scala/pink/cozydev/protosearch/Scorer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@
package pink.cozydev.protosearch

import cats.data.NonEmptyList
import cats.syntax.all._
import cats.syntax.all.*
VigneshSK17 marked this conversation as resolved.
Show resolved Hide resolved
import pink.cozydev.lucille.Query
import scala.collection.mutable.{HashMap => MMap}

import scala.collection.mutable.HashMap
import pink.cozydev.lucille.MultiQuery
import pink.cozydev.protosearch.internal.PositionalIter

import java.util.regex.PatternSyntaxException

case class Scorer(index: MultiIndex, defaultOR: Boolean = true) {

private val defaultIdx: Index = index.indexes(index.schema.defaultField)
Expand Down Expand Up @@ -51,7 +54,7 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) {
case Query.UnaryPlus(q) => accScore(idx, NonEmptyList.one(q))
case q: Query.Proximity => Left(s"Unsupported Proximity encountered in Scorer: $q")
case q: Query.Fuzzy => Left(s"Unsupported Fuzzy encountered in Scorer: $q")
case q: Query.TermRegex => Left(s"Unsupported Regex in Scorer: $q")
case q: Query.TermRegex => regexScore(idx, docs, q)
case q: Query.MinimumMatch => Left(s"Unsupported MinimumMatch in Scorer: $q")
}
accScore(defaultIdx, NonEmptyList.one(qs)).map(combineMaps)
Expand Down Expand Up @@ -103,8 +106,26 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) {
case _ => Left(s"Unsupported TermRange error: $q")
}

/** Score documents for a regex query.
  *
  * The pattern text in `q.str` is compiled first. A pattern that cannot be compiled is
  * reported as a `Left` rather than surfacing as an exception. Each term returned by
  * `idx.termDict.termsForRegex` is then scored with `idx.scoreTFIDF`, producing one score
  * map per matching term.
  *
  * @param idx  the index whose term dictionary and scores are consulted
  * @param docs the document ids passed through to `scoreTFIDF`
  * @param q    the regex query to evaluate
  * @return one score map per matching term (a single empty map when no term matches),
  *         or an error message for an invalid pattern
  */
private def regexScore(
    idx: Index,
    docs: Set[Int],
    q: Query.TermRegex,
): Either[String, NonEmptyList[Map[Int, Double]]] = {
  // `try` is an expression, so the failure case becomes a value instead of an early `return`.
  // Only pattern compilation is guarded; scoring runs outside the `try`.
  val compiled: Either[String, scala.util.matching.Regex] =
    try Right(q.str.r)
    catch { case _: PatternSyntaxException => Left(s"Invalid regex query $q") }

  compiled.map { regex =>
    NonEmptyList.fromList(idx.termDict.termsForRegex(regex)) match {
      // No matching term: keep the NonEmptyList shape with one empty score map.
      case None => NonEmptyList.one(Map.empty[Int, Double])
      case Some(terms) => terms.map(idx.scoreTFIDF(docs, _).toMap)
    }
  }
}

private def combineMaps(ms: NonEmptyList[Map[Int, Double]]): List[(Int, Double)] = {
val mb = MMap.empty ++ ms.head
val mb = HashMap.empty ++ ms.head
ms.tail.foreach(m1 =>
m1.foreach { case (k: Int, v: Double) => mb.update(k, v + mb.getOrElse(k, 0.0)) }
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package pink.cozydev.protosearch.internal

import scala.annotation.tailrec
import scala.util.matching.Regex

final class TermDictionary(
private val termDict: Array[String]
Expand Down Expand Up @@ -46,6 +47,10 @@ final class TermDictionary(
bldr.result()
}

/** Collect the terms in which `regex` finds at least one match.
  *
  * Matching uses `findFirstIn`, so a match anywhere inside a term is enough; the pattern
  * is not required to cover the whole term. Terms keep their dictionary order.
  */
def termsForRegex(regex: Regex): List[String] =
  for {
    term <- termDict.toList
    if regex.findFirstIn(term).nonEmpty
  } yield term

/** Get the indices of the terms starting with `prefix`. */
def indicesForPrefix(prefix: String): Array[Int] = {
var i = termIndexWhere(prefix)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,22 @@ class FrequencyIndexSearcherSuite extends munit.FunSuite {
)
}

// Two regex clauses in one query: the result is compared against the expected document ids.
test("regex") {
  val expected = Right(Set(0, 1, 2))
  assertEquals(search("/jump.*/ /cat/"), expected)
}

// A pattern that cannot be compiled must come back as a Left carrying the error message.
test("regex fail") {
  val expected = Left("Invalid regex query TermRegex([)")
  assertEquals(search("/[/"), expected)
}

}
14 changes: 14 additions & 0 deletions core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,18 @@ class MultiIndexSuite extends munit.FunSuite {
assertEquals(books, Right(List(eggs)))
}

// A regex query with an alternation group is compared against the expected hits, in order.
test("regex") {
  val expected = Right(List(peter, eggs))
  assertEquals(search("/e(r|e)/"), expected)
}

// A pattern that cannot be compiled must come back as a Left carrying the error message.
test("regex fail") {
  val expected = Left("Invalid regex query TermRegex([a)")
  assertEquals(search("/[a/"), expected)
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -188,4 +188,22 @@ class PositionalIndexSearcherSuite extends munit.FunSuite {
val q = search("\"fakeword\"")
assertEquals(q, Right(Set.empty[Int]))
}

// A regex query with an alternation group is compared against the expected document ids.
test("regex") {
  val expected = Right(Set(0, 1))
  assertEquals(search("/f(o|a)/"), expected)
}

// A pattern that cannot be compiled must come back as a Left carrying the error message.
test("regex fail") {
  val expected = Left("Invalid regex query TermRegex([a)")
  assertEquals(search("/[a/"), expected)
}
}
9 changes: 9 additions & 0 deletions docs/queries.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,12 @@ Additionally, field queries can take more complex boolean queries if specified i
```scala mdoc
search("author:([b TO e] AND NOT dr*)")
```

## Regex Query

Regex queries allow for greater query flexibility by matching terms against a regular expression. Write the pattern between forward slashes.

```scala mdoc
search("/jump.*/")
```