Use vector as intermediate representation for row functions.

mchav · Jan 18, 2025 · cc0599e · cc0599e
1 parent be5190a
commit cc0599e
Show file tree

Hide file tree

Showing 4 changed files with 76 additions and 71 deletions.
diff --git a/src/Data/DataFrame/Internal/Function.hs b/src/Data/DataFrame/Internal/Function.hs
@@ -6,10 +6,13 @@
 {-# LANGUAGE GADTs #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE InstanceSigs #-}
+{-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE PatternSynonyms #-}
 
 module Data.DataFrame.Internal.Function where
 
 import qualified Data.Text as T
+import qualified Data.Vector as V
 
 import Data.DataFrame.Internal.Types
 import Data.Typeable ( Typeable, type (:~:)(Refl) )
@@ -52,19 +55,29 @@ instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnabl
 func :: forall fn . WrapFunction fn => fn -> Function
 func = wrapFunction
 
-funcApply :: forall c . (Columnable c) => [RowValue] -> Function ->  c
-funcApply [] _ = error "Empty args"
-funcApply [Value (x :: a')] (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of
+pattern Empty :: V.Vector a
+pattern Empty <- (V.null -> True) where Empty = V.empty 
+
+uncons :: V.Vector a -> Maybe (a, V.Vector a)
+uncons Empty = Nothing
+uncons v     = Just (V.unsafeHead v, V.unsafeTail v)
+
+pattern (:<|)  :: a -> V.Vector a -> V.Vector a
+pattern x :<| xs <- (uncons -> Just (x, xs))
+
+funcApply :: forall c . (Columnable c) => V.Vector RowValue -> Function ->  c
+funcApply Empty _ = error "Empty args"
+funcApply (Value (x :: a') :<| Empty) (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of
         Just Refl -> case testEquality (typeOf (f x)) (typeRep @c) of
             Just Refl -> f x
             Nothing -> error "Result type mismatch"
         Nothing -> error "Arg type mismatch"
-funcApply (Value (x :: a') : xs) (F2 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
+funcApply (Value (x :: a') :<| xs) (F2 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
         Just Refl -> funcApply xs (F1 (f x))
         Nothing -> error "Arg type mismatch"
-funcApply (Value (x :: a') : xs) (F3 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
+funcApply (Value (x :: a') :<| xs) (F3 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
         Just Refl -> funcApply xs (F2 (f x))
         Nothing -> error "Arg type mismatch"
-funcApply (Value (x :: a') : xs) (F4 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
+funcApply (Value (x :: a') :<| xs) (F4 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
         Just Refl -> funcApply xs (F3 (f x))
         Nothing -> error "Arg type mismatch"
diff --git a/src/Data/DataFrame/Internal/Row.hs b/src/Data/DataFrame/Internal/Row.hs
@@ -6,84 +6,65 @@ import qualified Data.Map as M
 import qualified Data.Set as S
 import qualified Data.Text as T
 import qualified Data.Vector as V
+import qualified Data.Vector.Generic as VG
 import qualified Data.Vector.Unboxed as VU
+import qualified Data.Vector.Algorithms.Merge as VA
 
 import Control.Exception (throw)
+import Control.Monad.ST (runST)
 import Data.DataFrame.Errors (DataFrameException(..))
 import Data.DataFrame.Internal.Column
 import Data.DataFrame.Internal.DataFrame
 import Data.DataFrame.Internal.Types
+import Data.Function (on)
 
-type Row = [RowValue]
+type Row = V.Vector RowValue
 
 toRowList :: [T.Text] -> DataFrame -> [Row]
 toRowList names df = let
     nameSet = S.fromList names
   in map (mkRowRep df nameSet) [0..(fst (dataframeDimensions df) - 1)]
 
-mkRowFromArgs :: [T.Text] -> DataFrame -> Int -> [RowValue]
-mkRowFromArgs names df i = foldr go [] names
+toRowVector :: [T.Text] -> DataFrame -> V.Vector Row
+toRowVector names df = let
+    nameSet = S.fromList names
+  in V.generate (fst (dataframeDimensions df)) (mkRowRep df nameSet)
+
+mkRowFromArgs :: [T.Text] -> DataFrame -> Int -> Row
+mkRowFromArgs names df i = V.map get (V.fromList names)
   where
-    go name acc = case getColumn name df of
+    get name = case getColumn name df of
       Nothing -> throw $ ColumnNotFoundException name "[INTERNAL] mkRowFromArgs" (map fst $ M.toList $ columnIndices df)
-      Just (BoxedColumn column) -> toRowValue (column V.! i) : acc
-      Just (UnboxedColumn column) -> toRowValue (column VU.! i) : acc
+      Just (BoxedColumn column) -> toRowValue (column V.! i)
+      Just (UnboxedColumn column) -> toRowValue (column VU.! i)
 
 mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Row
-mkRowRep df names i = reverse $ V.ifoldl' go [] (columns df)
+mkRowRep df names i = V.generate (S.size names) (\index -> get index (names' V.! index))
   where
-    indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
-    go acc k Nothing = acc
-    go acc k (Just (BoxedColumn c)) =
-      if S.notMember (indexMap M.! k) names
-        then acc
-        else case c V.!? i of
-          Just e -> toRowValue e : acc
-          Nothing ->
-            error $
-              "Column "
-                ++ T.unpack (indexMap M.! k)
-                ++ " has less items than "
-                ++ "the other columns at index "
-                ++ show i
-    go acc k (Just (UnboxedColumn c)) =
-      if S.notMember (indexMap M.! k) names
-        then acc
-        else case c VU.!? i of
-          Just e -> toRowValue e : acc
-          Nothing ->
-            error $
-              "Column "
-                ++ T.unpack (indexMap M.! k)
-                ++ " has less items than "
-                ++ "the other columns at index "
-                ++ show i
-    go acc k (Just (GroupedBoxedColumn c)) =
-      if S.notMember (indexMap M.! k) names
-        then acc
-        else case c V.!? i of
-          Just e -> toRowValue e : acc
-          Nothing ->
-            error $
-              "Column "
-                ++ T.unpack (indexMap M.! k)
-                ++ " has less items than "
-                ++ "the other columns at index "
-                ++ show i
-    go acc k (Just (GroupedUnboxedColumn c)) =
-      if S.notMember (indexMap M.! k) names
-        then acc
-        else case c V.!? i of
-          Just e -> toRowValue e : acc
-          Nothing ->
-            error $
-              "Column "
-                ++ T.unpack (indexMap M.! k)
+    inOrderIndexes = map fst $ L.sortBy (compare `on` snd) $ M.toList (columnIndices df)
+    names' = V.fromList [n | n <- inOrderIndexes, S.member n names]
+    throwError name = error $ "Column "
+                ++ T.unpack name
                 ++ " has less items than "
                 ++ "the other columns at index "
                 ++ show i
+    get index name = case getColumn name df of 
+      Just (BoxedColumn c) -> case c V.!? index of
+        Just e -> toRowValue e
+        Nothing -> throwError name
+      Just (UnboxedColumn c) -> case c VU.!? index of
+        Just e -> toRowValue e
+        Nothing -> throwError name
+      Just (GroupedBoxedColumn c) -> case c V.!? index of
+        Just e -> toRowValue e
+        Nothing -> throwError name
+      Just (GroupedUnboxedColumn c) -> case c V.!? index of
+        Just e -> toRowValue e
+        Nothing -> throwError name
 
-sortedIndexes' :: Bool -> [Row] -> VU.Vector Int
-sortedIndexes' asc rows = VU.fromList
-                        $ map fst
-                        $ L.sortBy (\(a, b) (a', b') -> (if asc then compare else flip compare) b b') (zip [0..] rows)
+sortedIndexes' :: Bool -> V.Vector Row -> VU.Vector Int
+sortedIndexes' asc rows = runST $ do
+  withIndexes <- VG.thaw (V.indexed rows)
+  VA.sortBy (\(a, b) (a', b') -> (if asc then compare else flip compare) b b') withIndexes
+  sorted <- VG.unsafeFreeze withIndexes
+  return $ VU.generate (VG.length rows) (\i -> fst (sorted VG.! i))
diff --git a/src/Data/DataFrame/Operations/Aggregation.hs b/src/Data/DataFrame/Operations/Aggregation.hs
@@ -33,7 +33,7 @@ import Data.Function ((&))
 import Data.Hashable
 import Data.Maybe
 import Data.Type.Equality (type (:~:)(Refl), TestEquality(..))
-import Type.Reflection (typeRep)
+import Type.Reflection (typeRep, typeOf)
 
 -- | O(k * n) groups the dataframe by the given rows aggregating the remaining rows
 -- into vector that should be reduced later.
@@ -75,27 +75,27 @@ groupBy names df
     groupingColumns = columnNames df L.\\ names
 
 mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Int
-mkRowRep df names i = hash $ V.ifoldl' go "" (columns df)
+mkRowRep df names i = hash $ V.ifoldl' go [] (columns df)
   where
     indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
     go acc k Nothing = acc
-    go acc k (Just (BoxedColumn c)) =
+    go acc k (Just (BoxedColumn (c :: V.Vector a))) =
       if S.notMember (indexMap M.! k) names
         then acc
         else case c V.!? i of
-          Just e -> acc <> (T.pack . show) e
+          Just e -> hash' @a e : acc
           Nothing ->
             error $
               "Column "
                 ++ T.unpack (indexMap M.! k)
                 ++ " has less items than "
                 ++ "the other columns at index "
                 ++ show i
-    go acc k (Just (UnboxedColumn c)) =
+    go acc k (Just (UnboxedColumn (c :: VU.Vector a))) =
       if S.notMember (indexMap M.! k) names
         then acc
         else case c VU.!? i of
-          Just e -> acc <> (T.pack . show) e
+          Just e -> hash' @a e : acc
           Nothing ->
             error $
               "Column "
@@ -104,6 +104,17 @@ mkRowRep df names i = hash $ V.ifoldl' go "" (columns df)
                 ++ "the other columns at index "
                 ++ show i
 
+-- | This hash function returns the hash when given a non numeric type but
+-- the value when given a numeric.
+hash' :: Columnable a => a -> Double
+hash' value = case testEquality (typeOf value) (typeRep @Double) of
+  Just Refl -> value
+  Nothing -> case testEquality (typeOf value) (typeRep @Int) of
+    Just Refl -> fromIntegral value
+    Nothing -> case testEquality (typeOf value) (typeRep @T.Text) of
+      Just Refl -> fromIntegral $ hash value
+      Nothing -> fromIntegral $ hash (show value)
+
 mkGroupedColumns :: VU.Vector Int -> DataFrame -> DataFrame -> T.Text -> DataFrame
 mkGroupedColumns indices df acc name =
   case (V.!) (columns df) (columnIndices df M.! name) of

diff --git a/src/Data/DataFrame/Operations/Sorting.hs b/src/Data/DataFrame/Operations/Sorting.hs
@@ -28,6 +28,6 @@ sortBy order names df
   | otherwise = let
       -- TODO: Remove the SortOrder defintion from operations so we can share it between here and internal and
       -- we don't have to do this Bool mapping.
-      indexes = sortedIndexes' (order == Ascending) (toRowList names df)
+      indexes = sortedIndexes' (order == Ascending) (toRowVector names df)
       pick idxs col = atIndicesStable idxs <$> col
     in df {columns = V.map (pick indexes) (columns df)}