From cc0599ef24a3684cb510bee12ff55b40441864fd Mon Sep 17 00:00:00 2001 From: Michael Chavinda Date: Fri, 17 Jan 2025 16:52:51 -0800 Subject: [PATCH] Use vector as intermediate representation for row functions. --- src/Data/DataFrame/Internal/Function.hs | 25 +++-- src/Data/DataFrame/Internal/Row.hs | 97 ++++++++------------ src/Data/DataFrame/Operations/Aggregation.hs | 23 +++-- src/Data/DataFrame/Operations/Sorting.hs | 2 +- 4 files changed, 76 insertions(+), 71 deletions(-) diff --git a/src/Data/DataFrame/Internal/Function.hs b/src/Data/DataFrame/Internal/Function.hs index 5ebcf7e..a076ade 100644 --- a/src/Data/DataFrame/Internal/Function.hs +++ b/src/Data/DataFrame/Internal/Function.hs @@ -6,10 +6,13 @@ {-# LANGUAGE GADTs #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE InstanceSigs #-} +{-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE PatternSynonyms #-} module Data.DataFrame.Internal.Function where import qualified Data.Text as T +import qualified Data.Vector as V import Data.DataFrame.Internal.Types import Data.Typeable ( Typeable, type (:~:)(Refl) ) @@ -52,19 +55,29 @@ instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnabl func :: forall fn . WrapFunction fn => fn -> Function func = wrapFunction -funcApply :: forall c . (Columnable c) => [RowValue] -> Function -> c -funcApply [] _ = error "Empty args" -funcApply [Value (x :: a')] (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of +pattern Empty :: V.Vector a +pattern Empty <- (V.null -> True) where Empty = V.empty + +uncons :: V.Vector a -> Maybe (a, V.Vector a) +uncons Empty = Nothing +uncons v = Just (V.unsafeHead v, V.unsafeTail v) + +pattern (:<|) :: a -> V.Vector a -> V.Vector a +pattern x :<| xs <- (uncons -> Just (x, xs)) + +funcApply :: forall c . (Columnable c) => V.Vector RowValue -> Function -> c +funcApply Empty _ = error "Empty args" +funcApply (Value (x :: a') :<| Empty) (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of Just Refl -> case testEquality (typeOf (f x)) (typeRep @c) of Just Refl -> f x Nothing -> error "Result type mismatch" Nothing -> error "Arg type mismatch" -funcApply (Value (x :: a') : xs) (F2 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of +funcApply (Value (x :: a') :<| xs) (F2 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of Just Refl -> funcApply xs (F1 (f x)) Nothing -> error "Arg type mismatch" -funcApply (Value (x :: a') : xs) (F3 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of +funcApply (Value (x :: a') :<| xs) (F3 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of Just Refl -> funcApply xs (F2 (f x)) Nothing -> error "Arg type mismatch" -funcApply (Value (x :: a') : xs) (F4 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of +funcApply (Value (x :: a') :<| xs) (F4 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of Just Refl -> funcApply xs (F3 (f x)) Nothing -> error "Arg type mismatch" diff --git a/src/Data/DataFrame/Internal/Row.hs b/src/Data/DataFrame/Internal/Row.hs index 388d063..cfdb287 100644 --- a/src/Data/DataFrame/Internal/Row.hs +++ b/src/Data/DataFrame/Internal/Row.hs @@ -6,84 +6,65 @@ import qualified Data.Map as M import qualified Data.Set as S import qualified Data.Text as T import qualified Data.Vector as V +import qualified Data.Vector.Generic as VG import qualified Data.Vector.Unboxed as VU +import qualified Data.Vector.Algorithms.Merge as VA import Control.Exception (throw) +import Control.Monad.ST (runST) import Data.DataFrame.Errors (DataFrameException(..)) import Data.DataFrame.Internal.Column import Data.DataFrame.Internal.DataFrame import Data.DataFrame.Internal.Types +import Data.Function (on) -type Row = [RowValue] +type Row = V.Vector RowValue toRowList :: [T.Text] -> DataFrame -> [Row] toRowList names df = let nameSet = S.fromList names in map (mkRowRep df nameSet) [0..(fst (dataframeDimensions df) - 1)] -mkRowFromArgs :: [T.Text] -> DataFrame -> Int -> [RowValue] -mkRowFromArgs names df i = foldr go [] names +toRowVector :: [T.Text] -> DataFrame -> V.Vector Row +toRowVector names df = let + nameSet = S.fromList names + in V.generate (fst (dataframeDimensions df)) (mkRowRep df nameSet) + +mkRowFromArgs :: [T.Text] -> DataFrame -> Int -> Row +mkRowFromArgs names df i = V.map get (V.fromList names) where - go name acc = case getColumn name df of + get name = case getColumn name df of Nothing -> throw $ ColumnNotFoundException name "[INTERNAL] mkRowFromArgs" (map fst $ M.toList $ columnIndices df) - Just (BoxedColumn column) -> toRowValue (column V.! i) : acc - Just (UnboxedColumn column) -> toRowValue (column VU.! i) : acc + Just (BoxedColumn column) -> toRowValue (column V.! i) + Just (UnboxedColumn column) -> toRowValue (column VU.! i) mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Row -mkRowRep df names i = reverse $ V.ifoldl' go [] (columns df) +mkRowRep df names i = V.generate (S.size names) (\index -> get index (names' V.! index)) where - indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df)) - go acc k Nothing = acc - go acc k (Just (BoxedColumn c)) = - if S.notMember (indexMap M.! k) names - then acc - else case c V.!? i of - Just e -> toRowValue e : acc - Nothing -> - error $ - "Column " - ++ T.unpack (indexMap M.! k) - ++ " has less items than " - ++ "the other columns at index " - ++ show i - go acc k (Just (UnboxedColumn c)) = - if S.notMember (indexMap M.! k) names - then acc - else case c VU.!? i of - Just e -> toRowValue e : acc - Nothing -> - error $ - "Column " - ++ T.unpack (indexMap M.! k) - ++ " has less items than " - ++ "the other columns at index " - ++ show i - go acc k (Just (GroupedBoxedColumn c)) = - if S.notMember (indexMap M.! k) names - then acc - else case c V.!? i of - Just e -> toRowValue e : acc - Nothing -> - error $ - "Column " - ++ T.unpack (indexMap M.! k) - ++ " has less items than " - ++ "the other columns at index " - ++ show i - go acc k (Just (GroupedUnboxedColumn c)) = - if S.notMember (indexMap M.! k) names - then acc - else case c V.!? i of - Just e -> toRowValue e : acc - Nothing -> - error $ - "Column " - ++ T.unpack (indexMap M.! k) + inOrderIndexes = map fst $ L.sortBy (compare `on` snd) $ M.toList (columnIndices df) + names' = V.fromList [n | n <- inOrderIndexes, S.member n names] + throwError name = error $ "Column " + ++ T.unpack name ++ " has less items than " ++ "the other columns at index " ++ show i + get index name = case getColumn name df of + Just (BoxedColumn c) -> case c V.!? index of + Just e -> toRowValue e + Nothing -> throwError name + Just (UnboxedColumn c) -> case c VU.!? index of + Just e -> toRowValue e + Nothing -> throwError name + Just (GroupedBoxedColumn c) -> case c V.!? index of + Just e -> toRowValue e + Nothing -> throwError name + Just (GroupedUnboxedColumn c) -> case c V.!? index of + Just e -> toRowValue e + Nothing -> throwError name -sortedIndexes' :: Bool -> [Row] -> VU.Vector Int -sortedIndexes' asc rows = VU.fromList - $ map fst - $ L.sortBy (\(a, b) (a', b') -> (if asc then compare else flip compare) b b') (zip [0..] rows) +sortedIndexes' :: Bool -> V.Vector Row -> VU.Vector Int +sortedIndexes' asc rows = runST $ do + withIndexes <- VG.thaw (V.indexed rows) + VA.sortBy (\(a, b) (a', b') -> (if asc then compare else flip compare) b b') withIndexes + sorted <- VG.unsafeFreeze withIndexes + return $ VU.generate (VG.length rows) (\i -> fst (sorted VG.! i)) diff --git a/src/Data/DataFrame/Operations/Aggregation.hs b/src/Data/DataFrame/Operations/Aggregation.hs index 93880c2..29d1e33 100644 --- a/src/Data/DataFrame/Operations/Aggregation.hs +++ b/src/Data/DataFrame/Operations/Aggregation.hs @@ -33,7 +33,7 @@ import Data.Function ((&)) import Data.Hashable import Data.Maybe import Data.Type.Equality (type (:~:)(Refl), TestEquality(..)) -import Type.Reflection (typeRep) +import Type.Reflection (typeRep, typeOf) -- | O(k * n) groups the dataframe by the given rows aggregating the remaining rows -- into vector that should be reduced later. @@ -75,15 +75,15 @@ groupBy names df groupingColumns = columnNames df L.\\ names mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Int -mkRowRep df names i = hash $ V.ifoldl' go "" (columns df) +mkRowRep df names i = hash $ V.ifoldl' go [] (columns df) where indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df)) go acc k Nothing = acc - go acc k (Just (BoxedColumn c)) = + go acc k (Just (BoxedColumn (c :: V.Vector a))) = if S.notMember (indexMap M.! k) names then acc else case c V.!? i of - Just e -> acc <> (T.pack . show) e + Just e -> hash' @a e : acc Nothing -> error $ "Column " @@ -91,11 +91,11 @@ mkRowRep df names i = hash $ V.ifoldl' go "" (columns df) ++ " has less items than " ++ "the other columns at index " ++ show i - go acc k (Just (UnboxedColumn c)) = + go acc k (Just (UnboxedColumn (c :: VU.Vector a))) = if S.notMember (indexMap M.! k) names then acc else case c VU.!? i of - Just e -> acc <> (T.pack . show) e + Just e -> hash' @a e : acc Nothing -> error $ "Column " @@ -104,6 +104,17 @@ mkRowRep df names i = hash $ V.ifoldl' go "" (columns df) ++ "the other columns at index " ++ show i +-- | This hash function returns the hash when given a non numeric type but +-- the value when given a numeric. +hash' :: Columnable a => a -> Double +hash' value = case testEquality (typeOf value) (typeRep @Double) of + Just Refl -> value + Nothing -> case testEquality (typeOf value) (typeRep @Int) of + Just Refl -> fromIntegral value + Nothing -> case testEquality (typeOf value) (typeRep @T.Text) of + Just Refl -> fromIntegral $ hash value + Nothing -> fromIntegral $ hash (show value) + mkGroupedColumns :: VU.Vector Int -> DataFrame -> DataFrame -> T.Text -> DataFrame mkGroupedColumns indices df acc name = case (V.!) (columns df) (columnIndices df M.! name) of diff --git a/src/Data/DataFrame/Operations/Sorting.hs b/src/Data/DataFrame/Operations/Sorting.hs index 21487bc..5ea347f 100644 --- a/src/Data/DataFrame/Operations/Sorting.hs +++ b/src/Data/DataFrame/Operations/Sorting.hs @@ -28,6 +28,6 @@ sortBy order names df | otherwise = let -- TODO: Remove the SortOrder defintion from operations so we can share it between here and internal and -- we don't have to do this Bool mapping. - indexes = sortedIndexes' (order == Ascending) (toRowList names df) + indexes = sortedIndexes' (order == Ascending) (toRowVector names df) pick idxs col = atIndicesStable idxs <$> col in df {columns = V.map (pick indexes) (columns df)}