Search code examples
haskelltypeslinguistics

Representing Types And Occurrences: (so) easy to understand, (so) difficult to code


A brief introduction to the types and occurrences through examples.

Ex1. abbacb

a, b, c are the types.

a occurres 2 times; b occurres 3 times; c occurres 1 times.

This can be represented more concisely as [('a',2),('b',3),('c',1)] (Indeed, the order doesn't matter).

Ex2. abbacb

ab, bb, ba, ac, cb are sequences of types

Each sequence occurs only once.

This can be represented as [("ab",1),("bb",1),("ba",1),("ac",1),("cb",1)]

The following graphical structure has the same informative content of the previous two:

('a',2)          -- 'a'  occurs 2 times
     ('b',1)     -- "ab" occurs 1 times
     ('c',1)     -- "ac" occurs 1 times
('b',2)          -- 'b'  occurs 2 times
     ('a',1)     -- "ba" occurs 1 times
     ('b',1)     -- "bb" occurs 1 times
('c',1)          -- 'c'  occurs 1 times
     ('b',1)     -- "cb" occurs 1 times

In Haskell: [(('a',2),[('b',1),('c',1)]),(('b',2),[('a',1),('b',1)]),(('c',1),[('b',1)])]

For occurrences of sequences of 3 elements:

('a',2)              -- 'a'   occurs 2 times
     ('b',1)         -- "ab"  occurs 1 times
          ('b',1)    -- "abb" occurs 1 times
     ('c',1)         -- "ac"  occurs 1 times
          ('b',1)    -- "acb" occurs 1 times
...

In Haskell:

[
    (('a',2), [(('b',1),[('b',1)]),(('c',1),[('b',1)])]),
    (('b',2), [(('a',1),[('c',1)]),(('b',1),[('a',1)])])
]

with type [((Char, Int), [((Char, Int), [(Char, Int)])])]

Even considering only the sequences of two and three elements, the comprehensibility of the graphical representation is much greater than that in Haskell.

In addition, lists are not very efficient, so I used the Data.Map library and consequently a slightly different representation.

The following examples are based on Pi's digits. Interesting results can be obtained using the words of a novel.

My questions are:

  1. Functions dedicated to the sequences of the three types are very complicated. It is possible to drastically simplify them?

  2. I cannot even imagine how it is possible to generalize the functions for sequences of arbitrary length. Someone has an idea of how it could be done?

  3. Using the following data type recursion should be easier to implement:

    data TuplesTypesOccurences a = L (M.Map a Int) | B  (M.Map a (Int,TuplesTypesOccurences a))
    

    In this way however does not lose access to all of the functions in Data.Map library?

    import qualified Data.Map as M
    import Data.List (sortBy)
    
    piDigits = "31415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756"
    
    type TypesOccurrences a = M.Map a Int
    
    toTypeOccurrences :: Ord k => [k] -> TypesOccurrences k -> TypesOccurrences k  
    toTypeOccurrences [] mp = mp
    toTypeOccurrences (x:xs) mp = toTypeOccurrences xs $ M.insertWith (+) x 1 mp
    -- ex. toTypeOccurrences piDigits M.empty
    
    pprintTO :: Show a => TypesOccurrences a -> IO ()
    pprintTO = mapM_ putStrLn . map (\(xs,n) -> show xs ++ "  " ++ (show n)). sortBy (\x y -> compare (snd y) (snd x)) . M.toList
    -- ex. pprintTO . M.filter (> 22) . toTypeOccurrences piDigits $ M.empty
    
    type Seq2TypeOccurrences a = M.Map a (Int,TypesOccurrences a)
    
    toSQ2TO :: Ord a => [a] -> Seq2TypeOccurrences a -> Seq2TypeOccurrences a
    toSQ2TO []       mp = mp
    toSQ2TO [x]      mp = mp
    toSQ2TO (x:y:xs) mp = toSQ2TO (y:xs) $
      case M.lookup x mp of
        Nothing       -> M.insert x (1,M.singleton y 1) mp
        Just (_,mp2) -> case M.lookup y mp2 of
            Nothing -> M.update (\(n,mp2) -> Just (n+1,M.insert y 1 mp2)) x mp
            Just _  -> M.update (\(n,mp2) -> Just (n+1,M.update (\m -> Just (m+1)) y mp2)) x mp
    -- ex. toSQ2TO piDigits M.empty
    
    pprintSQ2TO :: Show a => Seq2TypeOccurrences a -> IO ()
    pprintSQ2TO = mapM_ putStrLn . map (\(x,(n,mp)) -> "(" ++ (show x) ++ "," ++ (show n) ++ ")\n\t" ++ (drop 2 . concatMap (("\n\t" ++) . show) . M.toList $ mp)) . M.toList
    -- ex. pprintSQ2TO (toSQ2TO piDigits  M.empty)
    
    greaterThanSQ2TO :: Ord a => Int -> Seq2TypeOccurrences a -> Seq2TypeOccurrences a
    greaterThanSQ2TO n =   M.filter (\(_,mp2) -> not . M.null $ mp2) . M.map (\(o,mp2) -> (o,M.filter (> n) mp2)) . M.filter (\(m,mp) -> m > n)
    -- ex. pprintSQ2TO . greaterThanSQ2TO 4 . toSQ2TO piDigits $ M.empty
    
    descSortSQ2TO :: Ord a => Seq2TypeOccurrences a -> [([a], Int)]
    descSortSQ2TO = sortBy (\xs ys -> compare (snd ys) (snd xs)) . concatMap (\(x,ys) -> zipWith (\x (y,n) -> ([x,y],n)) (repeat x) ys ) . map (\(x,(_,mp2)) -> (x,M.toList mp2)) . M.toList
    -- mapM_ print . descSortSQ2TO . greaterThanSQ2TO 4 . toSQ2TO piDigits $ M.empty
    
    unionSQ2TO :: Ord a => Seq2TypeOccurrences a -> Seq2TypeOccurrences a -> Seq2TypeOccurrences a
    unionSQ2TO = M.unionWith (\(n1,mp1) (n2,mp2) -> (n1+n2, M.unionWith (+) mp1 mp2))
    
    type Seq3TypeOccurrences a = M.Map a (Int,Seq2TypeOccurrences a)
    
    toSQ3TO :: Ord k => [k] -> Seq3TypeOccurrences k -> Seq3TypeOccurrences k
    toSQ3TO []    mp = mp
    toSQ3TO [x]   mp = mp
    toSQ3TO [x,y] mp = mp
    toSQ3TO (x:y:z:xs) mp = toSQ3TO (y:z:xs) $
      case M.lookup x mp of
        Nothing -> M.insert x (1,M.singleton y (1,M.singleton z 1)) mp
        Just (_,mp2) -> case M.lookup y mp2 of
           Nothing -> M.update (\(n,mp2) -> Just (n+1,M.insert y (1,M.singleton z 1) mp2)) x mp
           Just (m,kns3)  -> case M.lookup z kns3 of
                Nothing -> M.update (\(n,_) -> Just (n+1,M.update (\(m,mp3) -> Just (m+1,M.insert z 1    mp3)) y mp2)) x mp
                Just _  -> M.update (\(n,_) -> Just (n+1,M.update (\(m,mp3) -> Just (m+1,M.update (Just . (+1)) z mp3)) y mp2)) x mp
    -- ex. toSQ3TO piDigits M.empty
    
    pprint3 :: Show a => Seq3TypeOccurrences a -> IO ()
    pprint3 = mapM_ putStrLn . map (\(x,(n,mp)) -> "(" ++ (show x) ++ "," ++ (show n) ++ ")" ++ (concatMap (\(x2,(n2,mp2)) -> "\n\t(" ++ (show x2) ++ "," ++ (show n2) ++ ")" ++ (f mp2)) . M.toList $ mp)) . M.toList
      where
      f = concatMap (\(x,n) -> "\n\t\t(" ++ (show x) ++ "," ++ (show n) ++ ")") . M.toList
    -- pprint3 . toSQ3TO piDigits $ M.empty  
    
    pprint3B :: Show a => Seq3TypeOccurrences a -> IO ()
    pprint3B = mapM_ putStrLn . map (\(xs,n) -> show xs ++ "  " ++ (show n)) . concatMap (\(xs,mp) -> zipWith (\ys (z,n) -> (ys ++ [z],n)) (repeat xs) mp) . concatMap (\(x,mp) -> zipWith (\y (z,mp2) -> ([y,z],mp2)) (repeat x) mp) . map (\(x,(_,mp)) -> (x, map (\(y,(_,mp2)) -> (y, M.toList mp2)) $ M.toList mp)) . M.toList
    -- pprint3B . toSQ3TO piDigits $ M.empty 
    
    greaterThan3Q2TO :: Ord a => Int -> Seq3TypeOccurrences a ->  Seq3TypeOccurrences a
    greaterThan3Q2TO n = M.filter (\(_,mp) -> not . M.null $ mp)
      . M.map (\(m,mp) -> (m,M.filter (\(o,mp2) -> not . M.null $ mp2) mp))
      . M.map (\(m,mp) -> (m,M.map (\(o,mp2) -> (o,M.filter (>n) mp2)) mp))
      . M.filter (\(_,mp) -> not. M.null $ mp)
      . M.map (\(m,mp) -> (m,M.filter ((n <) . fst) mp))
      . M.filter (\(m,mp) -> m > n)
    -- ex. pprint3B . greaterThan3Q2TO 2 . toSQ3TO piDigits $ M.empty
    
    unionSQ3TO :: Ord a => Seq3TypeOccurrences a -> Seq3TypeOccurrences a -> Seq3TypeOccurrences a
    unionSQ3TO = M.unionWith (\(n,mp2a) (m,mp2b) -> (n+m,unionSQ2TO mp2a mp2b))
    

Solution

  • You need to define a recursive data structure like this:

    data Trie = Nil | Trie (Map Char (Int, Trie))
    

    This allows the show and add functions to be defined recursively.

    Here is an implementation. Run test3 to see an example of how it works.

    import qualified Data.Map as M
    import Text.PrettyPrint
    import Data.List
    
    data Trie = Nil | Trie (M.Map Char (Int, Trie))
    
    showTrie :: String -> Trie -> Doc
    showTrie _ Nil = empty
    showTrie prefix (Trie m) =
      vcat $ 
        do (k,(count,t)) <- M.assocs m
           let prefix' = prefix ++ [k]
           return $
             vcat [ lparen <> char '"' <> text prefix' <> char '"' <> comma <> int count <> rparen
                  , nest 4 (showTrie prefix' t)
                  ]
    
    -- add an element to a Trie
    addTrie :: Trie -> String -> Trie
    addTrie t [] = t
    addTrie Nil xs = addTrie (Trie M.empty) xs
    addTrie (Trie m) (x:xs) = 
      case M.lookup x m of
        Nothing     -> let t' = addTrie Nil xs
                       in Trie $ M.insert x (1,t') m
        Just (c,t)  -> let t' = addTrie t xs
                       in Trie $ M.insert x (c+1,t') m
    
    test1 = 
      let t1 = addTrie Nil "abcd"
          t2 = addTrie t1  "abce"
      in putStrLn $ render $ showTrie "" t2
    
    test2 n str =
      putStrLn $ render $ showTrie "" $
          foldr (flip addTrie) Nil (map (take n) (tails str))
    
    test3 = test2 4 "31415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756"