Search code examples
clojure

How to properly split a string into n number of pieces in clojure?


I'm attempting to turn a string into a collection of size N. My approach returns the wrong results.


(defn chop
  "Cut `s` into consecutive chunks of floor(length / pieces) characters each
  and return them as a lazy seq of strings.

  Caveat: characters left over after `pieces` full chunks spill into extra
  chunk(s), so the result holds more than `pieces` entries whenever the
  length of `s` is not a multiple of `pieces`."
  [s pieces]
  (let [chunk-len (int (/ (count s) pieces))]
    (->> s
         (partition-all chunk-len)
         (map str/join))))

;; Test input: the whole file read into a single string.
(def big-str (slurp "/path/to/9065-byte-string.txt"))

;; 100 characters into 100 pieces: the count printed below is 100, as wanted.
(count (chop (str/join (take 100 (repeat "x"))) 100))
100

;; 10005 characters into 100 pieces: the count printed below is 101, one too many.
(count (chop (str/join (take 10005 (repeat "x"))) 100))
101

So with my test string, which I'm trying to split into 100 pieces, I sometimes get 101 pieces (whenever the string length isn't an even multiple of 100). I'm not sure what is going on. Maybe my math for piece-size is wrong.

It works if I pad the string, but I don't want to do that.

(defn chop
  "Split `s` into exactly `pieces` equal-length strings by first padding it
  with trailing spaces up to the next multiple of `pieces`.

  s      - the string to split
  pieces - how many pieces to produce (positive integer)

  Returns a lazy seq of `pieces` strings; the last one(s) carry the padding.
  Prints the computed pad size and piece size as a debugging aid.

  Note: when the length of `s` is already a multiple of `pieces`, a full
  extra `pieces` spaces are appended (one more character per piece)."
  [s pieces]
  (let [;; Pad relative to `pieces` (not a fixed 100) so the padded length is
        ;; always a multiple of the requested piece count.
        pad-size   (- pieces (mod (count s) pieces))
        padded     (str s (str/join (repeat pad-size " ")))
        piece-size (quot (count padded) pieces)]
    (println "pad-size=" pad-size)
    (println "piece-size=" piece-size)
    (map #(str/join %) (partition-all piece-size padded))))

Solution

  • Your math is wrong.

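    Suppose you have N items and you want g groups. If N/g isn't an integer, the groups cannot all be the same size: some must hold one more item than the others. (Using the truncated size for every chunk, as `partition-all` does in your code, leaves leftover characters that spill into extra chunks.) To spread out the difference, define it like so: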

    (ns tst.demo.core
      (:use tupelo.core tupelo.test)
      (:require
        [tupelo.string :as str]))
    
    ;; Split `s` into `groups` pieces whose sizes differ by at most one
    ;; character: x "small" groups of size a come first, followed by
    ;; y "big" groups of size b = a + 1.
    ;;
    ;; Debug version: it prints a blank line and a separator, and the bindings
    ;; use `let-spy` (the accompanying sample output shows one line per
    ;; binding). Returns a vector of strings.
    (defn chop
      [s groups]
      (newline)
      (println :-----------------------------------------------------------------------------)
      (let-spy
        [N      (count s)
         r      (/ (float N) (float groups))  ; or use `quot`
         a      (int (Math/floor r))  ; size of "small" groups
         b      (inc a)               ; size of "big"   groups
    
         ; Solve 2 equations in 2 unknowns (with b = a + 1):
         ; xa + yb = N
         ; x  + y  = g
         ; => y = N - a*g  and  x = g - y = b*g - N
         x      (- (* b groups) N)   ; number of "small" groups
         y      (- groups x)         ; number of "big" groups
         N1     (* x a)  ; chars in all small groups
         N2     (* y b)  ; chars in all big groups
         >>     (assert (= N (+ N1 N2)))   ; verify calculated correctly; `>>` is just a throwaway binding name
         chars  (vec s)
         smalls (vec (partition a (take N1 chars)))  ; or use `split-at`
         bigs   (vec (partition b (drop N1 chars)))
    
         ; small groups first, then big groups, each joined back into a string
         result (mapv str/join
                  (concat smalls bigs))
         ]
        result))
    

    with unit tests:

    ;; Unit tests for `chop`.
    (dotest
      ;; Small inputs: the exact expected split is checked; when the sizes are
      ;; uneven, the shorter groups come first.
      (is= (chop "abcd" 2) ["ab" "cd"])
      (is= (chop "abcd" 3) ["a" "b" "cd"])
      (is= (chop "abcde" 3) ["a" "bc" "de"])
      (is= (chop "abcdef" 3) ["ab" "cd" "ef"])
      (is= (chop "abcdefg" 3) ["ab" "cd" "efg"])
    
      ;; Longer inputs (100 and 105 characters into 10 groups): only the number
      ;; of groups is checked.
      (let [s100 (str/join (take 100 (repeat "x")))
            s105 (str/join (take 105 (repeat "x")))
    
            r100 (chop s100 10)
            r105 (chop s105 10)
            ]
        (is= 10 (spyx (count r100)))
        (is= 10 (spyx (count r105)))))
    

    with results printed like:

    :-----------------------------------------------------------------------------
    N => 4
    r => 2.0
    a => 2
    b => 3
    x => 2
    y => 0
    N1 => 4
    N2 => 0
    >> => nil
    chars => [\a \b \c \d]
    smalls => [(\a \b) (\c \d)]
    bigs => []
    result => ["ab" "cd"]
    
    :-----------------------------------------------------------------------------
    N => 4
    r => 1.3333333333333333
    a => 1
    b => 2
    x => 2
    y => 1
    N1 => 2
    N2 => 2
    >> => nil
    chars => [\a \b \c \d]
    smalls => [(\a) (\b)]
    bigs => [(\c \d)]
    result => ["a" "b" "cd"]
    
    :-----------------------------------------------------------------------------
    N => 5
    r => 1.6666666666666667
    a => 1
    b => 2
    x => 1
    y => 2
    N1 => 1
    N2 => 4
    >> => nil
    chars => [\a \b \c \d \e]
    smalls => [(\a)]
    bigs => [(\b \c) (\d \e)]
    result => ["a" "bc" "de"]
    
    :-----------------------------------------------------------------------------
    N => 6
    r => 2.0
    a => 2
    b => 3
    x => 3
    y => 0
    N1 => 6
    N2 => 0
    >> => nil
    chars => [\a \b \c \d \e \f]
    smalls => [(\a \b) (\c \d) (\e \f)]
    bigs => []
    result => ["ab" "cd" "ef"]
    
    :-----------------------------------------------------------------------------
    N => 7
    r => 2.3333333333333335
    a => 2
    b => 3
    x => 2
    y => 1
    N1 => 4
    N2 => 3
    >> => nil
    chars => [\a \b \c \d \e \f \g]
    smalls => [(\a \b) (\c \d)]
    bigs => [(\e \f \g)]
    result => ["ab" "cd" "efg"]
    

    Once you have it debugged and you understand the steps, change let-spy to let and remove the other print statements.

    The above is made using my favorite template project.


    Update

    If you don't like solving systems of equations, you could just use `quot` and either `mod` or `rem` to figure out the division:

    (defn chop
      "Split `s` into exactly `groups` consecutive pieces whose lengths differ
      by at most one character; the shorter pieces come first.

      s      - a string (or any collection of characters)
      groups - number of pieces to produce; must be a positive integer

      Returns a vector of `groups` strings. When `s` has fewer characters than
      `groups`, the leading pieces are empty strings, so the result still has
      exactly `groups` entries."
      [s groups]
      {:pre [(integer? groups) (pos? groups)]}
      (let [chars      (vec s)
            N          (count chars)
            nsmall     (quot N groups)       ; size of \"small\" groups
            ngrp-big   (rem N groups)        ; number of \"big\" groups (size nsmall + 1)
            ngrp-small (- groups ngrp-big)   ; number of \"small\" groups
            ;; One size per output piece. Slicing by explicit sizes keeps
            ;; zero-length pieces, which `partition` would silently drop.
            sizes      (concat (repeat ngrp-small nsmall)
                               (repeat ngrp-big (inc nsmall)))
            ;; Running start offset of each piece; the extra final element
            ;; (equal to N) is ignored because `mapv` stops at the shorter seq.
            starts     (reductions + 0 sizes)]
        (mapv (fn [start size]
                (str/join (subvec chars start (+ start size))))
              starts
              sizes)))