Skip to content

Commit

Permalink
Copies are the devil. Remove grow logic and calculate num lines befor…
Browse files Browse the repository at this point in the history
…e hand.
  • Loading branch information
mchav committed Dec 29, 2024
1 parent 730c03e commit 48e0ee6
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 82 deletions.
3 changes: 3 additions & 0 deletions dataframe.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ library dataframe-lib
Data.DataFrame.Function
build-depends: base >= 4.17.2.0 && < 4.21,
array ^>= 0.5,
bytestring,
containers >= 0.6.7 && < 0.8,
directory >= 1.3.0.0,
statistics >= 0.13,
Expand All @@ -54,6 +55,7 @@ library dataframe-lib-dev
Data.DataFrame.Function
build-depends: base >= 4.17.2.0 && < 4.21,
array ^>= 0.5,
bytestring,
containers >= 0.6.7 && < 0.8,
directory >= 1.3.0.0,
statistics >= 0.13,
Expand All @@ -76,6 +78,7 @@ executable dataframe
Data.DataFrame.Function
build-depends: base >= 4.17.2.0 && < 4.21,
array ^>= 0.5,
bytestring,
containers >= 0.6.7 && < 0.8,
directory >= 1.3.0.0,
statistics >= 0.13,
Expand Down
167 changes: 85 additions & 82 deletions src/Data/DataFrame/IO.hs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Char8 as BLC
import qualified Data.Vector.Unboxed.Mutable as VUM
import qualified Data.Vector.Mutable as VM
import qualified Data.Text.Encoding as TE

import Control.Monad (foldM_, forM_, replicateM_, foldM, when, zipWithM_, unless)
import Control.Monad.IO.Class (MonadIO(liftIO))
Expand Down Expand Up @@ -85,7 +89,7 @@ readTsv = readSeparated '\t' defaultOptions
-- | Reads a character separated file into a dataframe using mutable vectors.
readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
readSeparated c opts path = withFile path ReadMode $ \handle -> do
firstRow <- map T.strip . T.split (c ==) <$> TIO.hGetLine handle
firstRow <- T.split (c ==) <$> TIO.hGetLine handle
let columnNames = if hasHeader opts
then map (T.filter (/= '\"')) firstRow
else map (T.singleton . intToDigit) [0..(length firstRow - 1)]
Expand All @@ -94,104 +98,43 @@ readSeparated c opts path = withFile path ReadMode $ \handle -> do

-- Initialize mutable vectors for each column
let numColumns = length columnNames
mutableCols <- VM.replicateM numColumns (VM.new 1024) -- Start with a capacity of 1024 rows per column
totalRows <- countRows path
let actualRows = if hasHeader opts then totalRows - 1 else totalRows
mutableCols <- VM.replicateM numColumns (VM.new actualRows) -- Start with a capacity of 1024 rows per column
rowCounts <- VM.replicate numColumns 0 -- Keeps track of the row count for each column

-- Read rows into the mutable vectors
fillColumns c mutableCols rowCounts handle
fillColumns 0 c mutableCols handle

-- Freeze the mutable vectors into immutable ones
cols <- V.mapM (freezeColumn rowCounts mutableCols) (V.generate numColumns id)
cols <- V.mapM (freezeColumn mutableCols opts) (V.generate numColumns id)
return $ DataFrame {
columns = V.map (mkColumn opts) cols,
columns = cols,
freeIndices = [],
columnIndices = M.fromList (zip columnNames [0..]),
dataframeDimensions = (V.length $ cols V.! 0, V.length $ cols)
dataframeDimensions = (maybe 0 columnLength (cols V.! 0), V.length cols)
}

writeCsv :: String -> DataFrame -> IO ()
writeCsv = writeSeparated ','

writeSeparated :: Char -- ^ Separator
-> String -- ^ Path to write to
-> DataFrame
-> IO ()
writeSeparated c filepath df = withFile filepath WriteMode $ \handle ->do
let (rows, columns) = dataframeDimensions df
let headers = map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df)))
TIO.hPutStrLn handle (T.intercalate ", " headers)
forM_ [0..(rows - 1)] $ \i -> do
let row = getRowAsText df i
TIO.hPutStrLn handle (T.intercalate ", " row)

getRowAsText :: DataFrame -> Int -> [T.Text]
getRowAsText df i = V.ifoldr go [] (columns df)
where
indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
go k Nothing acc = acc
go k (Just (BoxedColumn (c :: V.Vector a))) acc = case c V.!? i of
Just e -> textRep : acc
where textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
Just Refl -> e
Nothing -> case typeRep @a of
App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
Just HRefl -> case testEquality t2 (typeRep @T.Text) of
Just Refl -> fromMaybe "null" e
Nothing -> (fromOptional . (T.pack . show)) e
where fromOptional s
| T.isPrefixOf "Just " s = T.drop (T.length "Just ") s
| otherwise = "null"
Nothing -> (T.pack . show) e
_ -> (T.pack . show) e
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
++ " has less items than "
++ "the other columns at index "
++ show i
go k (Just (UnboxedColumn c)) acc = case c VU.!? i of
Just e -> T.pack (show e) : acc
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
++ " has less items than "
++ "the other columns at index "
++ show i

-- | Reads rows from the handle and stores values in mutable vectors.
fillColumns :: Char -> VM.IOVector (VM.IOVector T.Text) -> VM.IOVector Int -> Handle -> IO ()
fillColumns c mutableCols rowCounts handle = do
fillColumns :: Int -> Char -> VM.IOVector (VM.IOVector T.Text) -> Handle -> IO ()
fillColumns i c mutableCols handle = do
isEOF <- hIsEOF handle
unless isEOF $ do
row <- map T.strip . split c <$> TIO.hGetLine handle
zipWithM_ (writeValue mutableCols rowCounts) [0..] row
fillColumns c mutableCols rowCounts handle
row <- split c <$> TIO.hGetLine handle
zipWithM_ (writeValue mutableCols i) [0..] row
fillColumns (i + 1) c mutableCols handle

-- | Writes a value into the appropriate column, resizing the vector if necessary.
writeValue :: VM.IOVector (VM.IOVector T.Text) -> VM.IOVector Int -> Int -> T.Text -> IO ()
writeValue mutableCols rowCounts colIndex value = do
writeValue :: VM.IOVector (VM.IOVector T.Text) -> Int -> Int -> T.Text -> IO ()
writeValue mutableCols count colIndex value = do
col <- VM.read mutableCols colIndex
count <- VM.read rowCounts colIndex
let capacity = VM.length col
when (count >= capacity) $ do
-- Double the size of the vector if it's full
let newCapacity = capacity * 2
newCol <- VM.grow col newCapacity
VM.write mutableCols colIndex newCol

-- In case we resized we need to get the column again.
col' <- VM.read mutableCols colIndex
VM.write col' count value
VM.write rowCounts colIndex (count + 1)
VM.write col count value

-- | Freezes a mutable vector into an immutable one, trimming it to the actual row count.
freezeColumn :: VM.IOVector Int -> VM.IOVector (VM.IOVector T.Text) -> Int -> IO (V.Vector T.Text)
freezeColumn rowCounts mutableCols colIndex = do
count <- VM.read rowCounts colIndex
freezeColumn :: VM.IOVector (VM.IOVector T.Text) -> ReadOptions -> Int -> IO (Maybe Column)
freezeColumn mutableCols opts colIndex = do
col <- VM.read mutableCols colIndex
V.freeze (VM.slice 0 count col)
mkColumn opts <$> V.freeze col

-- | Constructs a dataframe column, optionally inferring types.
mkColumn :: ReadOptions -> V.Vector T.Text -> Maybe Column
Expand Down Expand Up @@ -233,5 +176,65 @@ hasCommaInQuotes = snd . T.foldl' go (False, False)

removeQuotes :: T.Text -> T.Text
removeQuotes s
| T.null s = s
| otherwise = if T.head s == '\"' && T.last s == '\"' then T.init (T.tail s) else s
| T.null s' = s'
| otherwise = if T.head s' == '\"' && T.last s' == '\"' then T.init (T.tail s') else s'
where s' = T.strip s


-- | First pass to count rows for exact allocation
countRows :: FilePath -> IO Int
countRows path = do
contents <- BL.readFile path
return $ length $ BLC.lines contents


writeCsv :: String -> DataFrame -> IO ()
writeCsv = writeSeparated ','

writeSeparated :: Char -- ^ Separator
-> String -- ^ Path to write to
-> DataFrame
-> IO ()
writeSeparated c filepath df = withFile filepath WriteMode $ \handle ->do
let (rows, columns) = dataframeDimensions df
let headers = map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df)))
TIO.hPutStrLn handle (T.intercalate ", " headers)
forM_ [0..(rows - 1)] $ \i -> do
let row = getRowAsText df i
TIO.hPutStrLn handle (T.intercalate ", " row)

getRowAsText :: DataFrame -> Int -> [T.Text]
getRowAsText df i = V.ifoldr go [] (columns df)
where
indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
go k Nothing acc = acc
go k (Just (BoxedColumn (c :: V.Vector a))) acc = case c V.!? i of
Just e -> textRep : acc
where textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
Just Refl -> e
Nothing -> case typeRep @a of
App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
Just HRefl -> case testEquality t2 (typeRep @T.Text) of
Just Refl -> fromMaybe "null" e
Nothing -> (fromOptional . (T.pack . show)) e
where fromOptional s
| T.isPrefixOf "Just " s = T.drop (T.length "Just ") s
| otherwise = "null"
Nothing -> (T.pack . show) e
_ -> (T.pack . show) e
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
++ " has less items than "
++ "the other columns at index "
++ show i
go k (Just (UnboxedColumn c)) acc = case c VU.!? i of
Just e -> T.pack (show e) : acc
Nothing ->
error $
"Column "
++ T.unpack (indexMap M.! k)
++ " has less items than "
++ "the other columns at index "
++ show i

0 comments on commit 48e0ee6

Please sign in to comment.