From fa8f5b94bb24a7383bb6e10866c0ce880b75758d Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Sat, 23 Mar 2024 20:23:25 -0400 Subject: [PATCH 1/9] Added basic version for regex search --- .../pink/cozydev/protosearch/IndexSearcher.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala index 125280f7..efc2cf39 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala @@ -47,7 +47,7 @@ case class IndexSearcher(index: Index, defaultOR: Boolean = true) { case q: Query.UnaryPlus => Left(s"Unsupported UnaryPlus in BooleanRetrieval: $q") case q: Query.Proximity => Left(s"Unsupported Proximity in BooleanRetrieval: $q") case q: Query.Fuzzy => Left(s"Unsupported Fuzzy in BooleanRetrieval: $q") - case q: Query.TermRegex => Left(s"Unsupported Regex in BooleanRetrieval: $q") + case q: Query.TermRegex => regexSearch(q) case q: Query.MinimumMatch => Left(s"Unsupported MinimumMatch in BooleanRetrieval: $q") } @@ -78,6 +78,18 @@ case class IndexSearcher(index: Index, defaultOR: Boolean = true) { } } + private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = + q match { + case Query.TermRegex(regexStr) => + val regex = regexStr.r + val terms = index.termDict.termsForRange("", "\uFFFF") + val matches = terms + .flatMap(regex.findFirstMatchIn(_)) + .map(m => index.termDict.termIndexWhere(m.source.toString)) + Right(matches.toSet) + case _ => Left("Unsupport Regex error") + } + private def defaultCombine(sets: NonEmptyList[Set[Int]]): Set[Int] = if (defaultOR) IndexSearcher.unionSets(sets) else IndexSearcher.intersectSets(sets) From 004eb970350f67643bdabd400e7b2f3552facaf0 Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Sat, 23 Mar 2024 23:16:51 -0400 Subject: [PATCH 2/9] Made slight tweak to account for 
PositionalIndex and added test cases --- .../cozydev/protosearch/IndexSearcher.scala | 21 +++++++++---------- .../FrequencyIndexSearcherSuite.scala | 9 ++++++++ .../cozydev/protosearch/MultiIndexSuite.scala | 5 +++++ .../PositionalIndexSearcherSuite.scala | 9 ++++++++ 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala index efc2cf39..3137decf 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala @@ -78,17 +78,16 @@ case class IndexSearcher(index: Index, defaultOR: Boolean = true) { } } - private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = - q match { - case Query.TermRegex(regexStr) => - val regex = regexStr.r - val terms = index.termDict.termsForRange("", "\uFFFF") - val matches = terms - .flatMap(regex.findFirstMatchIn(_)) - .map(m => index.termDict.termIndexWhere(m.source.toString)) - Right(matches.toSet) - case _ => Left("Unsupport Regex error") - } + private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = { + val regex = q.str.r + val terms = index.termDict.termsForRange("", "\uFFFF") + Right( + terms + .flatMap(regex.findFirstMatchIn(_)) + .flatMap(m => index.docsWithTerm(m.source.toString)) + .toSet + ) + } private def defaultCombine(sets: NonEmptyList[Set[Int]]): Set[Int] = if (defaultOR) IndexSearcher.unionSets(sets) else IndexSearcher.intersectSets(sets) diff --git a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala index c624c0b8..c3920542 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala @@ -124,4 +124,13 @@ class FrequencyIndexSearcherSuite extends 
munit.FunSuite { ) } + test("regex") { + val q = search("/a(c|l)/") + val results = Set(1, 2) + assertEquals( + q, + Right(results) + ) + } + } diff --git a/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala index 30939b98..84f6d7ce 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala @@ -123,4 +123,9 @@ class MultiIndexSuite extends munit.FunSuite { assertEquals(books, Right(List(eggs))) } + test("regex") { + val books = search("/e(r|e)/") + assertEquals(books, Right(List(peter, eggs))) + } + } diff --git a/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala index 0f16963c..20075421 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala @@ -188,4 +188,13 @@ class PositionalIndexSearcherSuite extends munit.FunSuite { val q = search("\"fakeword\"") assertEquals(q, Right(Set.empty[Int])) } + + test("regex") { + val q = search("/f(o|a)/") + val results = Set(0, 1) + assertEquals( + q, + Right(results) + ) + } } From 4915e614e8c04c0fefa5045f08b82c2574da5f70 Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Sat, 23 Mar 2024 23:36:33 -0400 Subject: [PATCH 3/9] Added example to docs, updated test case, closes #35 --- .../protosearch/FrequencyIndexSearcherSuite.scala | 4 ++-- docs/queries.md | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala index c3920542..797bd589 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala +++ 
b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala @@ -125,8 +125,8 @@ class FrequencyIndexSearcherSuite extends munit.FunSuite { } test("regex") { - val q = search("/a(c|l)/") - val results = Set(1, 2) + val q = search("/jump.*/ /cat/") + val results = Set(0, 1, 2) assertEquals( q, Right(results) diff --git a/docs/queries.md b/docs/queries.md index f1164393..219bf1d2 100644 --- a/docs/queries.md +++ b/docs/queries.md @@ -135,3 +135,12 @@ Additionally, field queries can take more complex boolean queries if specified i ```scala mdoc search("author:([b TO e] AND NOT dr*)") ``` + +## Regex Query + +Regex queries allow for greater query flexibility by utilizing powerful regular expressions. + +```scala mdoc +search("/jump.*/") +``` + From 207e7be8c531e1eed35c2f84105c2939a293405a Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Sun, 24 Mar 2024 18:09:37 -0400 Subject: [PATCH 4/9] Added regex correction in IndexSearcher --- .../cozydev/protosearch/IndexSearcher.scala | 34 +++++++++++++------ .../FrequencyIndexSearcherSuite.scala | 9 +++++ .../cozydev/protosearch/MultiIndexSuite.scala | 10 ++++++ .../PositionalIndexSearcherSuite.scala | 9 +++++ 4 files changed, 51 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala index d518db0d..d6d16782 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala @@ -20,6 +20,8 @@ import cats.data.NonEmptyList import pink.cozydev.lucille.{MultiQuery, Query, TermQuery} import pink.cozydev.protosearch.internal.PositionalIter +import java.util.regex.PatternSyntaxException + sealed abstract class IndexSearcher { def search(q: Query): Either[String, Set[Int]] @@ -77,16 +79,6 @@ object IndexSearcher { def search(q: NonEmptyList[Query]): Either[String, Set[Int]] = q.traverse(q => 
search(q)).map(defaultCombine) - private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = { - val regex = q.str.r - val terms = index.termDict.termsForRange("", "\uFFFF") - Right( - terms - .flatMap(regex.findFirstMatchIn(_)) - .flatMap(m => index.docsWithTerm(m.source.toString)) - .toSet - ) - } private def defaultCombine(sets: NonEmptyList[Set[Int]]): Set[Int] = if (defaultOR) IndexSearcher.unionSets(sets) else IndexSearcher.intersectSets(sets) @@ -107,7 +99,7 @@ object IndexSearcher { case q: Query.UnaryPlus => Left(s"Unsupported UnaryPlus in BooleanRetrieval: $q") case q: Query.Proximity => Left(s"Unsupported Proximity in BooleanRetrieval: $q") case q: Query.Fuzzy => Left(s"Unsupported Fuzzy in BooleanRetrieval: $q") - case q: Query.TermRegex => Left(s"Unsupported Regex in BooleanRetrieval: $q") + case q: Query.TermRegex => regexSearch(q) case q: Query.MinimumMatch => Left(s"Unsupported MinimumMatch in BooleanRetrieval: $q") } @@ -137,6 +129,26 @@ object IndexSearcher { case _ => Left("Unsupport TermRange error?") } } + + private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = { + + try { + val regex = q.str.r + + val terms = index.termDict.termsForRange("", "\uFFFF") + Right( + terms + .flatMap(regex.findFirstMatchIn(_)) + .flatMap(m => index.docsWithTerm(m.source.toString)) + .toSet + ) + } catch { + case _: PatternSyntaxException => + Left(s"Invalid regex query $q provided") + } + + + } } def intersectSets(sets: NonEmptyList[Set[Int]]): Set[Int] = diff --git a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala index 797bd589..5557020e 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala @@ -133,4 +133,13 @@ class FrequencyIndexSearcherSuite extends munit.FunSuite { ) } + test("regex fail") { + val q 
= search("/[/") + val err = "Invalid regex query TermRegex([) provided" + assertEquals( + q, + Left(err) + ) + } + } diff --git a/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala index 8ee03b85..063cc823 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala @@ -124,4 +124,14 @@ class MultiIndexSuite extends munit.FunSuite { assertEquals(books, Right(List(peter, eggs))) } + // TODO: Add to Scorer.scala + test("regex fail") { + val q = search("/[a/") + val err = "Invalid regex query TermRegex([) provided" + assertEquals( + q, + Left(err) + ) + } + } diff --git a/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala index 20075421..ba9b1bb7 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala @@ -197,4 +197,13 @@ class PositionalIndexSearcherSuite extends munit.FunSuite { Right(results) ) } + + test("regex fail") { + val q = search("/[a/") + val err = "Invalid regex query TermRegex([a) provided" + assertEquals( + q, + Left(err) + ) + } } From 7fbe6e5559b1d5b50cc7758fdc5f8636e075ed3a Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Sun, 24 Mar 2024 19:38:17 -0400 Subject: [PATCH 5/9] Updated Scorer for MultiIndex --- .../cozydev/protosearch/IndexSearcher.scala | 7 +--- .../pink/cozydev/protosearch/Scorer.scala | 32 +++++++++++++++++-- .../FrequencyIndexSearcherSuite.scala | 2 +- .../cozydev/protosearch/MultiIndexSuite.scala | 3 +- .../PositionalIndexSearcherSuite.scala | 2 +- 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala 
b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala index d6d16782..f03fe760 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala @@ -131,10 +131,8 @@ object IndexSearcher { } private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = { - try { val regex = q.str.r - val terms = index.termDict.termsForRange("", "\uFFFF") Right( terms @@ -143,11 +141,8 @@ object IndexSearcher { .toSet ) } catch { - case _: PatternSyntaxException => - Left(s"Invalid regex query $q provided") + case _: PatternSyntaxException => Left(s"Invalid regex query $q") } - - } } diff --git a/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala b/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala index 424c2813..eb737d63 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala @@ -17,12 +17,16 @@ package pink.cozydev.protosearch import cats.data.NonEmptyList -import cats.syntax.all._ +import cats.syntax.all.* import pink.cozydev.lucille.Query -import scala.collection.mutable.{HashMap => MMap} + +import scala.collection.mutable.HashMap as MMap import pink.cozydev.lucille.MultiQuery import pink.cozydev.protosearch.internal.PositionalIter +import java.util.regex.PatternSyntaxException +import scala.util.matching.Regex + case class Scorer(index: MultiIndex, defaultOR: Boolean = true) { private val defaultIdx: Index = index.indexes(index.schema.defaultField) @@ -51,7 +55,7 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) { case Query.UnaryPlus(q) => accScore(idx, NonEmptyList.one(q)) case q: Query.Proximity => Left(s"Unsupported Proximity encountered in Scorer: $q") case q: Query.Fuzzy => Left(s"Unsupported Fuzzy encountered in Scorer: $q") - case q: Query.TermRegex => Left(s"Unsupported Regex in Scorer: $q") + case q: Query.TermRegex => regexScore(idx, docs, q) case q: 
Query.MinimumMatch => Left(s"Unsupported MinimumMatch in Scorer: $q") } accScore(defaultIdx, NonEmptyList.one(qs)).map(combineMaps) @@ -103,6 +107,28 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) { case _ => Left(s"Unsupported TermRange error: $q") } + private def regexScore( + idx: Index, + docs: Set[Int], + q: Query.TermRegex, + ): Either[String, NonEmptyList[Map[Int, Double]]] = + try { + val regex = q.str.r + NonEmptyList.fromList(idx.termDict.termsForRange("", "\uFFFF")) match { + case None => Right(NonEmptyList.one(Map.empty[Int, Double])) + case Some(terms) => + val scores = terms + .collect { case term if regex.findFirstMatchIn(term).isDefined => term } + .map(t => idx.scoreTFIDF(docs, t).toMap) + NonEmptyList.fromList(scores) match { + case None => Right(NonEmptyList.one(Map.empty[Int, Double])) + case Some(s) => Right(s) + } + } + } catch { + case _: PatternSyntaxException => Left(s"Invalid regex query $q") + } + private def combineMaps(ms: NonEmptyList[Map[Int, Double]]): List[(Int, Double)] = { val mb = MMap.empty ++ ms.head ms.tail.foreach(m1 => diff --git a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala index 5557020e..f62fbc74 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala @@ -135,7 +135,7 @@ class FrequencyIndexSearcherSuite extends munit.FunSuite { test("regex fail") { val q = search("/[/") - val err = "Invalid regex query TermRegex([) provided" + val err = "Invalid regex query TermRegex([)" assertEquals( q, Left(err) diff --git a/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala index 063cc823..36ba891d 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala +++ 
b/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala @@ -124,10 +124,9 @@ class MultiIndexSuite extends munit.FunSuite { assertEquals(books, Right(List(peter, eggs))) } - // TODO: Add to Scorer.scala test("regex fail") { val q = search("/[a/") - val err = "Invalid regex query TermRegex([) provided" + val err = "Invalid regex query TermRegex([a)" assertEquals( q, Left(err) diff --git a/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala index ba9b1bb7..17a3590a 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala @@ -200,7 +200,7 @@ class PositionalIndexSearcherSuite extends munit.FunSuite { test("regex fail") { val q = search("/[a/") - val err = "Invalid regex query TermRegex([a) provided" + val err = "Invalid regex query TermRegex([a)" assertEquals( q, Left(err) From b9cbbed8755758a46a47e8eef914f7c0a520cd0f Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Mon, 25 Mar 2024 15:08:28 -0400 Subject: [PATCH 6/9] Created termsForRegex method, fixed formatting --- .../pink/cozydev/protosearch/IndexSearcher.scala | 13 +++++-------- .../scala/pink/cozydev/protosearch/Scorer.scala | 12 ++---------- .../protosearch/internal/TermDictionary.scala | 5 +++++ .../protosearch/FrequencyIndexSearcherSuite.scala | 4 ++-- .../pink/cozydev/protosearch/MultiIndexSuite.scala | 2 +- .../protosearch/PositionalIndexSearcherSuite.scala | 4 ++-- 6 files changed, 17 insertions(+), 23 deletions(-) diff --git a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala index f03fe760..463130d6 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala @@ -79,9 +79,8 @@ object 
IndexSearcher { def search(q: NonEmptyList[Query]): Either[String, Set[Int]] = q.traverse(q => search(q)).map(defaultCombine) - - private def defaultCombine(sets: NonEmptyList[Set[Int]]): Set[Int] = - if (defaultOR) IndexSearcher.unionSets(sets) else IndexSearcher.intersectSets(sets) + private def defaultCombine(sets: NonEmptyList[Set[Int]]): Set[Int] = + if (defaultOR) IndexSearcher.unionSets(sets) else IndexSearcher.intersectSets(sets) def search(q: Query): Either[String, Set[Int]] = q match { case Query.Term(q) => Right(index.docsWithTerm(q).toSet) @@ -130,20 +129,18 @@ object IndexSearcher { } } - private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = { + private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = try { val regex = q.str.r - val terms = index.termDict.termsForRange("", "\uFFFF") + val terms = index.termDict.termsForRegex(regex) Right( terms - .flatMap(regex.findFirstMatchIn(_)) - .flatMap(m => index.docsWithTerm(m.source.toString)) + .flatMap(m => index.docsWithTerm(m)) .toSet ) } catch { case _: PatternSyntaxException => Left(s"Invalid regex query $q") } - } } def intersectSets(sets: NonEmptyList[Set[Int]]): Set[Int] = diff --git a/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala b/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala index eb737d63..7e7f77ea 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala @@ -25,7 +25,6 @@ import pink.cozydev.lucille.MultiQuery import pink.cozydev.protosearch.internal.PositionalIter import java.util.regex.PatternSyntaxException -import scala.util.matching.Regex case class Scorer(index: MultiIndex, defaultOR: Boolean = true) { @@ -114,16 +113,9 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) { ): Either[String, NonEmptyList[Map[Int, Double]]] = try { val regex = q.str.r - NonEmptyList.fromList(idx.termDict.termsForRange("", "\uFFFF")) match { + 
NonEmptyList.fromList(idx.termDict.termsForRegex(regex)) match { case None => Right(NonEmptyList.one(Map.empty[Int, Double])) - case Some(terms) => - val scores = terms - .collect { case term if regex.findFirstMatchIn(term).isDefined => term } - .map(t => idx.scoreTFIDF(docs, t).toMap) - NonEmptyList.fromList(scores) match { - case None => Right(NonEmptyList.one(Map.empty[Int, Double])) - case Some(s) => Right(s) - } + case Some(terms) => Right(terms.map(idx.scoreTFIDF(docs, _).toMap)) } } catch { case _: PatternSyntaxException => Left(s"Invalid regex query $q") diff --git a/core/src/main/scala/pink/cozydev/protosearch/internal/TermDictionary.scala b/core/src/main/scala/pink/cozydev/protosearch/internal/TermDictionary.scala index 5ce415ab..21be859a 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/internal/TermDictionary.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/internal/TermDictionary.scala @@ -17,6 +17,7 @@ package pink.cozydev.protosearch.internal import scala.annotation.tailrec +import scala.util.matching.Regex final class TermDictionary( private val termDict: Array[String] @@ -46,6 +47,10 @@ final class TermDictionary( bldr.result() } + /** Get the list of terms matching the regex. */ + def termsForRegex(regex: Regex): List[String] = + termDict.filter(regex.findFirstIn(_).isDefined).toList + /** Get the list of terms starting with prefix . 
*/ def indicesForPrefix(prefix: String): Array[Int] = { var i = termIndexWhere(prefix) diff --git a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala index f62fbc74..71da6906 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/FrequencyIndexSearcherSuite.scala @@ -129,7 +129,7 @@ class FrequencyIndexSearcherSuite extends munit.FunSuite { val results = Set(0, 1, 2) assertEquals( q, - Right(results) + Right(results), ) } @@ -138,7 +138,7 @@ class FrequencyIndexSearcherSuite extends munit.FunSuite { val err = "Invalid regex query TermRegex([)" assertEquals( q, - Left(err) + Left(err), ) } diff --git a/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala index 36ba891d..2cf1c8d4 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/MultiIndexSuite.scala @@ -129,7 +129,7 @@ class MultiIndexSuite extends munit.FunSuite { val err = "Invalid regex query TermRegex([a)" assertEquals( q, - Left(err) + Left(err), ) } diff --git a/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala b/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala index 17a3590a..7b7a3121 100644 --- a/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala +++ b/core/src/test/scala/pink/cozydev/protosearch/PositionalIndexSearcherSuite.scala @@ -194,7 +194,7 @@ class PositionalIndexSearcherSuite extends munit.FunSuite { val results = Set(0, 1) assertEquals( q, - Right(results) + Right(results), ) } @@ -203,7 +203,7 @@ class PositionalIndexSearcherSuite extends munit.FunSuite { val err = "Invalid regex query TermRegex([a)" assertEquals( q, - Left(err) + Left(err), ) } } From 
972df7a70802cdf60505c6eca0c61b40ded292b6 Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Mon, 25 Mar 2024 15:29:10 -0400 Subject: [PATCH 7/9] Fixed scalafmt issues --- .../main/scala/pink/cozydev/protosearch/Scorer.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala b/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala index 7e7f77ea..f622d2d1 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala @@ -20,7 +20,7 @@ import cats.data.NonEmptyList import cats.syntax.all.* import pink.cozydev.lucille.Query -import scala.collection.mutable.HashMap as MMap +import scala.collection.mutable.HashMap import pink.cozydev.lucille.MultiQuery import pink.cozydev.protosearch.internal.PositionalIter @@ -107,9 +107,9 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) { } private def regexScore( - idx: Index, - docs: Set[Int], - q: Query.TermRegex, + idx: Index, + docs: Set[Int], + q: Query.TermRegex, ): Either[String, NonEmptyList[Map[Int, Double]]] = try { val regex = q.str.r @@ -122,7 +122,7 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) { } private def combineMaps(ms: NonEmptyList[Map[Int, Double]]): List[(Int, Double)] = { - val mb = MMap.empty ++ ms.head + val mb = HashMap.empty ++ ms.head ms.tail.foreach(m1 => m1.foreach { case (k: Int, v: Double) => mb.update(k, v + mb.getOrElse(k, 0.0)) } ) From b9fb0648e0b75eb7d4d2a7a7fd51f666082acd8e Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Tue, 26 Mar 2024 16:18:40 -0400 Subject: [PATCH 8/9] Fixed issues with CI --- .../src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala index 463130d6..ce551250 100644 --- 
a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala @@ -79,8 +79,6 @@ object IndexSearcher { def search(q: NonEmptyList[Query]): Either[String, Set[Int]] = q.traverse(q => search(q)).map(defaultCombine) - private def defaultCombine(sets: NonEmptyList[Set[Int]]): Set[Int] = - if (defaultOR) IndexSearcher.unionSets(sets) else IndexSearcher.intersectSets(sets) def search(q: Query): Either[String, Set[Int]] = q match { case Query.Term(q) => Right(index.docsWithTerm(q).toSet) From b38bbf27e4e420b43a98900e76b42580319c7518 Mon Sep 17 00:00:00 2001 From: VigneshSK17 Date: Wed, 27 Mar 2024 15:22:30 -0400 Subject: [PATCH 9/9] Cleaned up try-catch block --- .../cozydev/protosearch/IndexSearcher.scala | 26 ++++++++++--------- .../pink/cozydev/protosearch/Scorer.scala | 19 ++++++++------ 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala index ce551250..af279fe6 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/IndexSearcher.scala @@ -127,18 +127,20 @@ object IndexSearcher { } } - private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = - try { - val regex = q.str.r - val terms = index.termDict.termsForRegex(regex) - Right( - terms - .flatMap(m => index.docsWithTerm(m)) - .toSet - ) - } catch { - case _: PatternSyntaxException => Left(s"Invalid regex query $q") - } + private def regexSearch(q: Query.TermRegex): Either[String, Set[Int]] = { + val regex = + try + q.str.r + catch { + case _: PatternSyntaxException => return Left(s"Invalid regex query $q") + } + val terms = index.termDict.termsForRegex(regex) + Right( + terms + .flatMap(m => index.docsWithTerm(m)) + .toSet + ) + } } def intersectSets(sets: NonEmptyList[Set[Int]]): Set[Int] = diff --git 
a/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala b/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala index f622d2d1..d73269ae 100644 --- a/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala +++ b/core/src/main/scala/pink/cozydev/protosearch/Scorer.scala @@ -110,16 +110,19 @@ case class Scorer(index: MultiIndex, defaultOR: Boolean = true) { idx: Index, docs: Set[Int], q: Query.TermRegex, - ): Either[String, NonEmptyList[Map[Int, Double]]] = - try { - val regex = q.str.r - NonEmptyList.fromList(idx.termDict.termsForRegex(regex)) match { - case None => Right(NonEmptyList.one(Map.empty[Int, Double])) - case Some(terms) => Right(terms.map(idx.scoreTFIDF(docs, _).toMap)) + ): Either[String, NonEmptyList[Map[Int, Double]]] = { + val regex = + try + q.str.r + catch { + case _: PatternSyntaxException => return Left(s"Invalid regex query $q") } - } catch { - case _: PatternSyntaxException => Left(s"Invalid regex query $q") + + NonEmptyList.fromList(idx.termDict.termsForRegex(regex)) match { + case None => Right(NonEmptyList.one(Map.empty[Int, Double])) + case Some(terms) => Right(terms.map(idx.scoreTFIDF(docs, _).toMap)) } + } private def combineMaps(ms: NonEmptyList[Map[Int, Double]]): List[(Int, Double)] = { val mb = HashMap.empty ++ ms.head