Skip to content

Commit

Permalink
Use vector as intermediate representation for row functions.
Browse files Browse the repository at this point in the history
  • Loading branch information
mchav committed Jan 18, 2025
1 parent be5190a commit cc0599e
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 71 deletions.
25 changes: 19 additions & 6 deletions src/Data/DataFrame/Internal/Function.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
{-# LANGUAGE GADTs #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE PatternSynonyms #-}

module Data.DataFrame.Internal.Function where

import qualified Data.Text as T
import qualified Data.Vector as V

import Data.DataFrame.Internal.Types
import Data.Typeable ( Typeable, type (:~:)(Refl) )
Expand Down Expand Up @@ -52,19 +55,29 @@ instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnabl
func :: forall fn . WrapFunction fn => fn -> Function
func = wrapFunction

funcApply :: forall c . (Columnable c) => [RowValue] -> Function -> c
funcApply [] _ = error "Empty args"
funcApply [Value (x :: a')] (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of
pattern Empty :: V.Vector a
pattern Empty <- (V.null -> True) where Empty = V.empty

uncons :: V.Vector a -> Maybe (a, V.Vector a)
uncons Empty = Nothing
uncons v = Just (V.unsafeHead v, V.unsafeTail v)

pattern (:<|) :: a -> V.Vector a -> V.Vector a
pattern x :<| xs <- (uncons -> Just (x, xs))

funcApply :: forall c . (Columnable c) => V.Vector RowValue -> Function -> c
funcApply Empty _ = error "Empty args"
funcApply (Value (x :: a') :<| Empty) (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of
Just Refl -> case testEquality (typeOf (f x)) (typeRep @c) of
Just Refl -> f x
Nothing -> error "Result type mismatch"
Nothing -> error "Arg type mismatch"
funcApply (Value (x :: a') : xs) (F2 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
funcApply (Value (x :: a') :<| xs) (F2 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
Just Refl -> funcApply xs (F1 (f x))
Nothing -> error "Arg type mismatch"
funcApply (Value (x :: a') : xs) (F3 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
funcApply (Value (x :: a') :<| xs) (F3 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
Just Refl -> funcApply xs (F2 (f x))
Nothing -> error "Arg type mismatch"
funcApply (Value (x :: a') : xs) (F4 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
funcApply (Value (x :: a') :<| xs) (F4 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
Just Refl -> funcApply xs (F3 (f x))
Nothing -> error "Arg type mismatch"
97 changes: 39 additions & 58 deletions src/Data/DataFrame/Internal/Row.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,84 +6,65 @@ import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Algorithms.Merge as VA

import Control.Exception (throw)
import Control.Monad.ST (runST)
import Data.DataFrame.Errors (DataFrameException(..))
import Data.DataFrame.Internal.Column
import Data.DataFrame.Internal.DataFrame
import Data.DataFrame.Internal.Types
import Data.Function (on)

type Row = [RowValue]
type Row = V.Vector RowValue

toRowList :: [T.Text] -> DataFrame -> [Row]
toRowList names df = let
nameSet = S.fromList names
in map (mkRowRep df nameSet) [0..(fst (dataframeDimensions df) - 1)]

mkRowFromArgs :: [T.Text] -> DataFrame -> Int -> [RowValue]
mkRowFromArgs names df i = foldr go [] names
toRowVector :: [T.Text] -> DataFrame -> V.Vector Row
toRowVector names df = let
nameSet = S.fromList names
in V.generate (fst (dataframeDimensions df)) (mkRowRep df nameSet)

mkRowFromArgs :: [T.Text] -> DataFrame -> Int -> Row
mkRowFromArgs names df i = V.map get (V.fromList names)
where
go name acc = case getColumn name df of
get name = case getColumn name df of
Nothing -> throw $ ColumnNotFoundException name "[INTERNAL] mkRowFromArgs" (map fst $ M.toList $ columnIndices df)
Just (BoxedColumn column) -> toRowValue (column V.! i) : acc
Just (UnboxedColumn column) -> toRowValue (column VU.! i) : acc
Just (BoxedColumn column) -> toRowValue (column V.! i)
Just (UnboxedColumn column) -> toRowValue (column VU.! i)

mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Row
mkRowRep df names i = reverse $ V.ifoldl' go [] (columns df)
mkRowRep df names i = V.generate (S.size names) (\index -> get index (names' V.! index))
where
indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
go acc k Nothing = acc
go acc k (Just (BoxedColumn c)) =
if S.notMember (indexMap M.! k) names
then acc
else case c V.!? i of
Just e -> toRowValue e : acc
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
++ " has less items than "
++ "the other columns at index "
++ show i
go acc k (Just (UnboxedColumn c)) =
if S.notMember (indexMap M.! k) names
then acc
else case c VU.!? i of
Just e -> toRowValue e : acc
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
++ " has less items than "
++ "the other columns at index "
++ show i
go acc k (Just (GroupedBoxedColumn c)) =
if S.notMember (indexMap M.! k) names
then acc
else case c V.!? i of
Just e -> toRowValue e : acc
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
++ " has less items than "
++ "the other columns at index "
++ show i
go acc k (Just (GroupedUnboxedColumn c)) =
if S.notMember (indexMap M.! k) names
then acc
else case c V.!? i of
Just e -> toRowValue e : acc
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
inOrderIndexes = map fst $ L.sortBy (compare `on` snd) $ M.toList (columnIndices df)
names' = V.fromList [n | n <- inOrderIndexes, S.member n names]
throwError name = error $ "Column "
++ T.unpack name
++ " has less items than "
++ "the other columns at index "
++ show i
get index name = case getColumn name df of
Just (BoxedColumn c) -> case c V.!? index of
Just e -> toRowValue e
Nothing -> throwError name
Just (UnboxedColumn c) -> case c VU.!? index of
Just e -> toRowValue e
Nothing -> throwError name
Just (GroupedBoxedColumn c) -> case c V.!? index of
Just e -> toRowValue e
Nothing -> throwError name
Just (GroupedUnboxedColumn c) -> case c V.!? index of
Just e -> toRowValue e
Nothing -> throwError name

sortedIndexes' :: Bool -> [Row] -> VU.Vector Int
sortedIndexes' asc rows = VU.fromList
$ map fst
$ L.sortBy (\(a, b) (a', b') -> (if asc then compare else flip compare) b b') (zip [0..] rows)
sortedIndexes' :: Bool -> V.Vector Row -> VU.Vector Int
sortedIndexes' asc rows = runST $ do
withIndexes <- VG.thaw (V.indexed rows)
VA.sortBy (\(a, b) (a', b') -> (if asc then compare else flip compare) b b') withIndexes
sorted <- VG.unsafeFreeze withIndexes
return $ VU.generate (VG.length rows) (\i -> fst (sorted VG.! i))
23 changes: 17 additions & 6 deletions src/Data/DataFrame/Operations/Aggregation.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import Data.Function ((&))
import Data.Hashable
import Data.Maybe
import Data.Type.Equality (type (:~:)(Refl), TestEquality(..))
import Type.Reflection (typeRep)
import Type.Reflection (typeRep, typeOf)

-- | O(k * n) groups the dataframe by the given rows aggregating the remaining rows
-- into vector that should be reduced later.
Expand Down Expand Up @@ -75,27 +75,27 @@ groupBy names df
groupingColumns = columnNames df L.\\ names

mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Int
mkRowRep df names i = hash $ V.ifoldl' go "" (columns df)
mkRowRep df names i = hash $ V.ifoldl' go [] (columns df)
where
indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
go acc k Nothing = acc
go acc k (Just (BoxedColumn c)) =
go acc k (Just (BoxedColumn (c :: V.Vector a))) =
if S.notMember (indexMap M.! k) names
then acc
else case c V.!? i of
Just e -> acc <> (T.pack . show) e
Just e -> hash' @a e : acc
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
++ " has less items than "
++ "the other columns at index "
++ show i
go acc k (Just (UnboxedColumn c)) =
go acc k (Just (UnboxedColumn (c :: VU.Vector a))) =
if S.notMember (indexMap M.! k) names
then acc
else case c VU.!? i of
Just e -> acc <> (T.pack . show) e
Just e -> hash' @a e : acc
Nothing ->
error $
"Column "
Expand All @@ -104,6 +104,17 @@ mkRowRep df names i = hash $ V.ifoldl' go "" (columns df)
++ "the other columns at index "
++ show i

-- | This hash function returns the hash when given a non numeric type but
-- the value when given a numeric.
hash' :: Columnable a => a -> Double
hash' value = case testEquality (typeOf value) (typeRep @Double) of
Just Refl -> value
Nothing -> case testEquality (typeOf value) (typeRep @Int) of
Just Refl -> fromIntegral value
Nothing -> case testEquality (typeOf value) (typeRep @T.Text) of
Just Refl -> fromIntegral $ hash value
Nothing -> fromIntegral $ hash (show value)

mkGroupedColumns :: VU.Vector Int -> DataFrame -> DataFrame -> T.Text -> DataFrame
mkGroupedColumns indices df acc name =
case (V.!) (columns df) (columnIndices df M.! name) of
Expand Down
2 changes: 1 addition & 1 deletion src/Data/DataFrame/Operations/Sorting.hs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ sortBy order names df
| otherwise = let
-- TODO: Remove the SortOrder defintion from operations so we can share it between here and internal and
-- we don't have to do this Bool mapping.
indexes = sortedIndexes' (order == Ascending) (toRowList names df)
indexes = sortedIndexes' (order == Ascending) (toRowVector names df)
pick idxs col = atIndicesStable idxs <$> col
in df {columns = V.map (pick indexes) (columns df)}

0 comments on commit cc0599e

Please sign in to comment.