Search code examples
haskellbytestring

Switching to ByteStrings


EDIT: I followed Yuras's and Dave4420's advice (thanks). I still get some errors, so I have updated the question. In the end I will use meiersi's version (thanks), but I would still like to understand my errors...

I have a simple script that goes like this:

import System.Environment

-- | Return the line at zero-based index @n@ of the input text.
--
-- Partial: fails when the input has @n@ or fewer lines.
getRow :: Int -> String -> String
getRow n src = lines src !! n

-- | Return the field at zero-based index @n@ of a row, where the fields
-- are the pieces produced by @words'@.
--
-- Partial: fails when the row has @n@ or fewer fields.
getField :: Int -> String -> String
getField n row = words' row !! n

-- | Split a string into fields on the separator @" ; "@.
--
-- The empty string yields no fields. Otherwise the text before the first
-- separator (via @takeHead@) is consed onto the fields of whatever
-- follows that separator (via @takeTail@).
words' :: String -> [String]
words' []  = []
words' str = takeHead sep str : words' (takeTail sep str)
  where
    sep = " ; "

-- | @takeHead st1 st2@ is the part of @st2@ that precedes the first
-- occurrence of @st1@, or all of @st2@ when @st1@ never occurs.
takeHead :: String -> String -> String
takeHead _ [] = []
takeHead st1 st2@(c : rest)
  | st1 == nHead (length st1) st2 = []
  | otherwise                     = c : takeHead st1 rest

-- | @takeTail st1 st2@ is the part of @st2@ that follows the first
-- occurrence of @st1@, or the empty string when @st1@ never occurs.
takeTail :: String -> String -> String
takeTail _ [] = []
takeTail st1 st2@(_ : rest)
  | st1 == nHead sepLen st2 = nTail sepLen st2
  | otherwise               = takeTail st1 rest
  where
    sepLen = length st1

-- | Drop the first @n@ characters of @str@.
--
-- A string shorter than @n@ is returned unchanged rather than emptied;
-- this behaviour of the hand-written version is kept on purpose.
-- A negative @n@ also returns @str@ unchanged (the hand-written recursion
-- ran off the end of the string and crashed in that case).
nTail :: Int -> String -> String
nTail n str
  | length str < n = str
  | otherwise      = drop n str

-- | Return the first @n@ characters of @str@, or all of @str@ when it is
-- shorter than @n@.
--
-- This is exactly 'take'. Using it directly avoids walking the whole
-- string with 'length' on every call, works on infinite input, and
-- returns the empty string for a negative @n@ instead of crashing.
nHead :: Int -> String -> String
nHead n str = take n str

-- | Look up one cell of the table text @src@.
--
-- @row@ and @field@ are zero-based indices supplied as strings and parsed
-- with 'read'.
--
-- Partial: 'read' fails on text that is not a number, and the lookups
-- fail when an index is out of range.
getValue :: String -> String -> String -> String
getValue row field src = getField fieldIx (getRow rowIx src)
  where
    rowIx   = read row
    fieldIx = read field

-- | Entry point.
--
-- * @PATH ROW FIELD@ prints the requested field of the requested row.
-- * @PATH@ alone (or followed by a single extra argument) prints the
--   number of lines in the file.
-- * No arguments at all is reported as a usage error; previously the
--   @case@ had no matching branch and the program died with a
--   pattern-match failure.
main :: IO ()
main = do
    args <- getArgs
    case args of
        (path : opt1 : opt2 : _) -> do
            src <- readFile path
            putStrLn $ getValue opt1 opt2 src
        (path : _) -> do
            src <- readFile path
            print $ length $ lines src
        [] -> ioError $ userError "usage: PROGRAM PATH [ROW FIELD]"

It compiles and works. Then I wanted to switch to ByteStrings. Here is my attempt:

import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as Bc (cons, empty,unpack)
import qualified Data.ByteString.Lazy.UTF8 as Bu (lines)
import qualified System.Posix.Env.ByteString as Bg (getArgs)

-- | The field separator @" ; "@ as a lazy ByteString, built one character
-- at a time with 'Bc.cons'.
separator :: B.ByteString
separator = foldr Bc.cons Bc.empty " ; "

-- | Intended to select row @n@ of the input, like the String version.
--
-- This definition does not type-check: GHC reports that 'B.index' expects
-- a @GHC.Int.Int64@ as its second argument while @n@ is an 'Int'.
-- NOTE(review): 'B.index' also looks up a single byte of a ByteString
-- rather than an element of the list built by 'Bu.lines', and @$@ applies
-- the section to the function 'Bu.lines' itself instead of composing with
-- it -- list indexing composed with @.@ is presumably what was meant;
-- confirm.
getRow :: Int -> B.ByteString -> B.ByteString
getRow n = (`B.index` n) $ Bu.lines

-- | Intended to select field @n@ of a row split by 'wordsWithSeparator'.
--
-- This definition does not type-check: GHC reports that 'B.index' expects
-- a @GHC.Int.Int64@ as its second argument while @n@ is an 'Int'.
-- NOTE(review): 'B.index' also looks up a single byte of a ByteString
-- rather than an element of a list of fields, and @$@ applies the section
-- to the function 'wordsWithSeparator' itself instead of composing with
-- it -- list indexing composed with @.@ is presumably what was meant;
-- confirm.
getCol :: Int -> B.ByteString -> B.ByteString
getCol n = (`B.index` n) $ wordsWithSeparator

-- | Split a ByteString into fields on 'separator'.
--
-- Empty input yields no fields; otherwise the bytes before the first
-- separator become the first field and the remainder is split the same
-- way.
wordsWithSeparator :: B.ByteString -> [B.ByteString]
wordsWithSeparator str
  | B.null str = []
  | otherwise  = firstField : wordsWithSeparator remainder
  where
    firstField = takeHead separator str
    remainder  = takeTail separator str

-- | @takeHead st1 st2@ is the part of @st2@ that precedes the first
-- occurrence of @st1@, or all of @st2@ when @st1@ never occurs.
takeHead :: B.ByteString -> B.ByteString -> B.ByteString
takeHead st1 st2
  | B.null st2    = B.empty
  | st1 == prefix = B.empty
  | otherwise     = B.cons (B.head st2) (takeHead st1 (B.tail st2))
  where
    prefix = nHead (toInteger (B.length st1)) st2

-- | @takeTail st1 st2@ is the part of @st2@ that follows the first
-- occurrence of @st1@, or the empty ByteString when @st1@ never occurs.
takeTail :: B.ByteString -> B.ByteString -> B.ByteString
takeTail st1 st2
  | B.null st2               = B.empty
  | st1 == nHead sepLen st2  = nTail sepLen st2
  | otherwise                = takeTail st1 (B.tail st2)
  where
    sepLen = toInteger (B.length st1)

-- | Drop the first @n@ bytes of @str@; a ByteString shorter than @n@ is
-- returned unchanged.
nTail :: Integer -> B.ByteString -> B.ByteString
nTail n str
  | toInteger (B.length str) < n = str
  | otherwise                    = go n str
  where
    -- Peel off one byte per step until the counter reaches zero.
    go 0 rest = rest
    go k rest = go (k - 1) (B.tail rest)

-- | Return the first @n@ bytes of @str@, or all of @str@ when it is
-- shorter than @n@.
nHead :: Integer -> B.ByteString -> B.ByteString
nHead n str
  | toInteger (B.length str) < n = str
  | otherwise                    = go n str
  where
    -- Rebuild the prefix one byte at a time until the counter reaches zero.
    go 0 _    = B.empty
    go k rest = B.cons (B.head rest) (go (k - 1) (B.tail rest))

-- | Look up one cell of the table held in @src@.
--
-- @row@ and @field@ are indices supplied as ByteStrings; each is unpacked
-- to a 'String' and parsed with 'read' before being handed to 'getRow'
-- and 'getCol'.
getValue :: B.ByteString -> B.ByteString -> B.ByteString -> B.ByteString
getValue row field src = getCol colIx (getRow rowIx src)
  where
    rowIx = read (Bc.unpack row)
    colIx = read (Bc.unpack field)

-- Entry point: with a path and two indices, print the selected cell;
-- with fewer arguments after the path, print the number of lines.
-- The raw (strict) arguments are first wrapped as lazy ByteStrings.
main = do
    rawArgs <- Bg.getArgs
    let args = map (\arg -> B.fromChunks [arg]) rawArgs
    case args of
        (path : opt1 : opt2 : _) -> do
            src <- B.readFile (Bc.unpack path)
            B.putStrLn (getValue opt1 opt2 src)
        (path : _) -> do
            src <- B.readFile (Bc.unpack path)
            print (length (Bu.lines src))

It doesn't compile, and I could not work out why. Here is what GHC tells me:

BETA_getlow2.hs:10:23:
    Couldn't match expected type `GHC.Int.Int64' with actual type `Int'
    In the second argument of `B.index', namely `n'
    In the expression: (`B.index` n)
    In the expression: (`B.index` n) $ Bu.lines

BETA_getlow2.hs:13:23:
    Couldn't match expected type `GHC.Int.Int64' with actual type `Int'
    In the second argument of `B.index', namely `n'
    In the expression: (`B.index` n)
    In the expression: (`B.index` n) $ wordsWithSeparator

Any tips would be appreciated.


Solution

  • I'm taking the liberty of reading the following two sub-questions into your original question.

    1. What Haskell code would one typically write for a script like the one you posted?
    2. What are the right data structures to efficiently perform the desired functionality?

    The following code gives one answer to these two sub-questions. It uses the text library to represent sequences of Unicode characters. Moreover, it exploits the text library's high-level API to implement the desired functionality. This makes the code easier to grasp and thereby avoids potential mistakes in the implementation of low-level functions.

    {-# LANGUAGE OverloadedStrings #-}
    
    import qualified Data.Text    as T
    import qualified Data.Text.IO as T
    
    import System.Environment (getArgs)
    
    type Table a = [[a]]
    
    -- | Parse text into a table: one row per line, with the cells of each
    -- row separated by @" ; "@.
    toTable :: T.Text -> Table T.Text
    toTable src = [T.splitOn " ; " row | row <- T.lines src]
    
    -- | Look up the cell at zero-based @row@ and @col@ of a table.
    --
    -- Partial: fails when either index is out of range.
    cell :: Int -> Int -> Table a -> a
    cell row col table = table !! row !! col
    
    -- | Entry point.
    --
    -- * @PATH ROW COL@ prints the requested cell of the table in the file.
    -- * @PATH@ alone (or followed by a single extra argument) prints the
    --   number of lines in the file.
    -- * No arguments at all is reported as a usage error; previously the
    --   refutable pattern on the argument list failed with an opaque
    --   pattern-match error.
    main :: IO ()
    main = do
        args <- getArgs
        case args of
            [] -> ioError $ userError "usage: PROGRAM PATH [ROW COL]"
            path : rest -> do
                src <- T.readFile path
                case rest of
                    row : col : _ -> T.putStrLn $ cell (read row) (read col) $ toTable src
                    _             -> print $ length $ T.lines src