Copies are the devil. Remove grow logic and calculate num lines befor…

…e hand.
mchav · Dec 29, 2024 · 48e0ee6 · 48e0ee6
1 parent 730c03e
commit 48e0ee6
Show file tree

Hide file tree

Showing 2 changed files with 88 additions and 82 deletions.
diff --git a/dataframe.cabal b/dataframe.cabal
@@ -33,6 +33,7 @@ library dataframe-lib
                    Data.DataFrame.Function
     build-depends:    base >= 4.17.2.0 && < 4.21,
                       array ^>= 0.5,
+                      bytestring,
                       containers >= 0.6.7 && < 0.8,
                       directory >= 1.3.0.0,
                       statistics >= 0.13,
@@ -54,6 +55,7 @@ library dataframe-lib-dev
                      Data.DataFrame.Function
     build-depends:    base >= 4.17.2.0 && < 4.21,
                       array ^>= 0.5,
+                      bytestring,
                       containers >= 0.6.7 && < 0.8,
                       directory >= 1.3.0.0,
                       statistics >= 0.13,
@@ -76,6 +78,7 @@ executable dataframe
                    Data.DataFrame.Function
     build-depends:    base >= 4.17.2.0 && < 4.21,
                       array ^>= 0.5,
+                      bytestring,
                       containers >= 0.6.7 && < 0.8,
                       directory >= 1.3.0.0,
                       statistics >= 0.13,

diff --git a/src/Data/DataFrame/IO.hs b/src/Data/DataFrame/IO.hs
@@ -31,7 +31,11 @@ import qualified Data.Text as T
 import qualified Data.Text.IO as TIO
 import qualified Data.Vector as V
 import qualified Data.Vector.Unboxed as VU
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Char8 as BLC
+import qualified Data.Vector.Unboxed.Mutable as VUM
 import qualified Data.Vector.Mutable as VM
+import qualified Data.Text.Encoding as TE
 
 import Control.Monad (foldM_, forM_, replicateM_, foldM, when, zipWithM_, unless)
 import Control.Monad.IO.Class (MonadIO(liftIO))
@@ -85,7 +89,7 @@ readTsv = readSeparated '\t' defaultOptions
 -- | Reads a character separated file into a dataframe using mutable vectors.
 readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
 readSeparated c opts path = withFile path ReadMode $ \handle -> do
-    firstRow <- map T.strip . T.split (c ==) <$> TIO.hGetLine handle
+    firstRow <- T.split (c ==) <$> TIO.hGetLine handle
     let columnNames = if hasHeader opts
                       then map (T.filter (/= '\"')) firstRow
                       else map (T.singleton . intToDigit) [0..(length firstRow - 1)]
@@ -94,104 +98,43 @@ readSeparated c opts path = withFile path ReadMode $ \handle -> do
 
     -- Initialize mutable vectors for each column
     let numColumns = length columnNames
-    mutableCols <- VM.replicateM numColumns (VM.new 1024)  -- Start with a capacity of 1024 rows per column
+    totalRows <- countRows path
+    let actualRows = if hasHeader opts then totalRows - 1 else totalRows
+    mutableCols <- VM.replicateM numColumns (VM.new actualRows)  -- Start with a capacity of 1024 rows per column
     rowCounts <- VM.replicate numColumns 0  -- Keeps track of the row count for each column
 
     -- Read rows into the mutable vectors
-    fillColumns c mutableCols rowCounts handle
+    fillColumns 0 c mutableCols handle
 
     -- Freeze the mutable vectors into immutable ones
-    cols <- V.mapM (freezeColumn rowCounts mutableCols) (V.generate numColumns id)
+    cols <- V.mapM (freezeColumn mutableCols opts) (V.generate numColumns id)
     return $ DataFrame {
-            columns = V.map (mkColumn opts) cols,
+            columns = cols,
             freeIndices = [],
             columnIndices = M.fromList (zip columnNames [0..]),
-            dataframeDimensions = (V.length $ cols V.! 0, V.length $ cols)
+            dataframeDimensions = (maybe 0 columnLength (cols V.! 0), V.length cols)
         }
 
-writeCsv :: String -> DataFrame -> IO ()
-writeCsv = writeSeparated ','
-
-writeSeparated :: Char      -- ^ Separator
-               -> String    -- ^ Path to write to
-               -> DataFrame
-               -> IO ()
-writeSeparated c filepath df = withFile filepath WriteMode $ \handle ->do
-    let (rows, columns) = dataframeDimensions df
-    let headers = map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df)))
-    TIO.hPutStrLn handle (T.intercalate ", " headers)
-    forM_ [0..(rows - 1)] $ \i -> do
-        let row = getRowAsText df i
-        TIO.hPutStrLn handle (T.intercalate ", " row)
-
-getRowAsText :: DataFrame -> Int -> [T.Text]
-getRowAsText df i = V.ifoldr go [] (columns df)
-  where
-    indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
-    go k Nothing acc = acc
-    go k (Just (BoxedColumn (c :: V.Vector a))) acc = case c V.!? i of
-        Just e -> textRep : acc
-            where textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
-                    Just Refl -> e
-                    Nothing   -> case typeRep @a of
-                        App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
-                            Just HRefl -> case testEquality t2 (typeRep @T.Text) of
-                                Just Refl -> fromMaybe "null" e
-                                Nothing -> (fromOptional . (T.pack . show)) e
-                                            where fromOptional s
-                                                    | T.isPrefixOf "Just " s = T.drop (T.length "Just ") s
-                                                    | otherwise = "null"
-                            Nothing -> (T.pack . show) e
-                        _ -> (T.pack . show) e
-        Nothing ->
-            error $
-                "Column "
-                ++ T.unpack (indexMap M.! k)
-                ++ " has less items than "
-                ++ "the other columns at index "
-                ++ show i
-    go k (Just (UnboxedColumn c)) acc = case c VU.!? i of
-        Just e -> T.pack (show e) : acc
-        Nothing ->
-            error $
-                "Column "
-                ++ T.unpack (indexMap M.! k)
-                ++ " has less items than "
-                ++ "the other columns at index "
-                ++ show i
-
 -- | Reads rows from the handle and stores values in mutable vectors.
-fillColumns :: Char -> VM.IOVector (VM.IOVector T.Text) -> VM.IOVector Int -> Handle -> IO ()
-fillColumns c mutableCols rowCounts handle = do
+fillColumns :: Int -> Char -> VM.IOVector (VM.IOVector T.Text) -> Handle -> IO ()
+fillColumns i c mutableCols handle = do
     isEOF <- hIsEOF handle
     unless isEOF $ do
-        row <- map T.strip . split c <$> TIO.hGetLine handle
-        zipWithM_ (writeValue mutableCols rowCounts) [0..] row
-        fillColumns c mutableCols rowCounts handle
+        row <- split c <$> TIO.hGetLine handle
+        zipWithM_ (writeValue mutableCols i) [0..] row
+        fillColumns (i + 1) c mutableCols handle
 
 -- | Writes a value into the appropriate column, resizing the vector if necessary.
-writeValue :: VM.IOVector (VM.IOVector T.Text) -> VM.IOVector Int -> Int -> T.Text -> IO ()
-writeValue mutableCols rowCounts colIndex value = do
+writeValue :: VM.IOVector (VM.IOVector T.Text) -> Int -> Int -> T.Text -> IO ()
+writeValue mutableCols count colIndex value = do
     col <- VM.read mutableCols colIndex
-    count <- VM.read rowCounts colIndex
-    let capacity = VM.length col
-    when (count >= capacity) $ do
-        -- Double the size of the vector if it's full
-        let newCapacity = capacity * 2
-        newCol <- VM.grow col newCapacity
-        VM.write mutableCols colIndex newCol
-
-    -- In case we resized we need to get the column again.
-    col' <- VM.read mutableCols colIndex
-    VM.write col' count value
-    VM.write rowCounts colIndex (count + 1)
+    VM.write col count value
 
 -- | Freezes a mutable vector into an immutable one, trimming it to the actual row count.
-freezeColumn :: VM.IOVector Int -> VM.IOVector (VM.IOVector T.Text) -> Int -> IO (V.Vector T.Text)
-freezeColumn rowCounts mutableCols colIndex = do
-    count <- VM.read rowCounts colIndex
+freezeColumn :: VM.IOVector (VM.IOVector T.Text) -> ReadOptions -> Int -> IO (Maybe Column)
+freezeColumn mutableCols opts colIndex = do
     col <- VM.read mutableCols colIndex
-    V.freeze (VM.slice 0 count col)
+    mkColumn opts <$> V.freeze col
 
 -- | Constructs a dataframe column, optionally inferring types.
 mkColumn :: ReadOptions -> V.Vector T.Text -> Maybe Column
@@ -233,5 +176,65 @@ hasCommaInQuotes = snd . T.foldl' go (False, False)
 
 removeQuotes :: T.Text -> T.Text
 removeQuotes s
-    | T.null s = s
-    | otherwise = if T.head s == '\"' && T.last s == '\"' then T.init (T.tail s) else s
+    | T.null s' = s'
+    | otherwise = if T.head s' == '\"' && T.last s' == '\"' then T.init (T.tail s') else s'
+        where s' = T.strip s
+
+
+-- | First pass to count rows for exact allocation
+countRows :: FilePath -> IO Int
+countRows path = do
+    contents <- BL.readFile path
+    return $ length $ BLC.lines contents
+
+
+writeCsv :: String -> DataFrame -> IO ()
+writeCsv = writeSeparated ','
+
+writeSeparated :: Char      -- ^ Separator
+               -> String    -- ^ Path to write to
+               -> DataFrame
+               -> IO ()
+writeSeparated c filepath df = withFile filepath WriteMode $ \handle ->do
+    let (rows, columns) = dataframeDimensions df
+    let headers = map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df)))
+    TIO.hPutStrLn handle (T.intercalate ", " headers)
+    forM_ [0..(rows - 1)] $ \i -> do
+        let row = getRowAsText df i
+        TIO.hPutStrLn handle (T.intercalate ", " row)
+
+getRowAsText :: DataFrame -> Int -> [T.Text]
+getRowAsText df i = V.ifoldr go [] (columns df)
+  where
+    indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
+    go k Nothing acc = acc
+    go k (Just (BoxedColumn (c :: V.Vector a))) acc = case c V.!? i of
+        Just e -> textRep : acc
+            where textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
+                    Just Refl -> e
+                    Nothing   -> case typeRep @a of
+                        App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
+                            Just HRefl -> case testEquality t2 (typeRep @T.Text) of
+                                Just Refl -> fromMaybe "null" e
+                                Nothing -> (fromOptional . (T.pack . show)) e
+                                            where fromOptional s
+                                                    | T.isPrefixOf "Just " s = T.drop (T.length "Just ") s
+                                                    | otherwise = "null"
+                            Nothing -> (T.pack . show) e
+                        _ -> (T.pack . show) e
+        Nothing ->
+            error $
+                "Column "
+                ++ T.unpack (indexMap M.! k)
+                ++ " has less items than "
+                ++ "the other columns at index "
+                ++ show i
+    go k (Just (UnboxedColumn c)) acc = case c VU.!? i of
+        Just e -> T.pack (show e) : acc
+        Nothing ->
+            error $
+                "Column "
+                ++ T.unpack (indexMap M.! k)
+                ++ " has less items than "
+                ++ "the other columns at index "
+                ++ show i