Search code examples
algorithmscalarle

Group adjacent similar (shortest prefix) elements in string or list in Scala


am facing dramatic performances issues for a kind of classic problem to solve.

i have as inputs a string like :

input :
  val test: String = "1,1,12,1,1,12,13"
  val test2: String = "1,1,12,1,1,12,13,1,1,12"

Desired output:

test --> 2{1,1,12},13

test2 --> 2{1,1,12},13,2{1},12

I have took the following approach, by comparing suffix, and prefix of that given string in a recursive fashion

  def shorten_prefix(pfx:String, sfx: String): (Integer, String, String) = {
    var count = 1
    var _sfx = sfx
    while (_sfx.startsWith(pfx)) {
      count = count + 1
      _sfx = _sfx.splitAt(pfx.length)._2
    }
    val _tmp =  s"$count{${pfx}}"
    (count, _tmp, _sfx)
  }
  def find_shortest_repr(input: String): String= {
    var possible_variants: mutable.ListBuffer[String] = new mutable.ListBuffer[String]()
    if (input.isEmpty) {
      return ""
    }

        val size = input.length
        for (i <- 1 until  size + 1){
          val (prefixes, suffixes) = input.splitAt(i)
          val (counter, shortened, new_ending) = shorten_prefix(prefixes, suffixes)

          val shortest_ending: String = find_shortest_repr(new_ending)
          val tmp = shortened ++ shortest_ending
          possible_variants += (tmp)
        }
        possible_variants.minBy(f => f.length)
      }


   def compress(x: String)= {
     val _tmp = find_shortest_repr(x)
     val regex = "(,\\})"
     val subst = "},"
     _tmp.replaceAll(regex, subst).dropRight(1)
   }

  println(compress(test))
  println(compress(test2))

}

It sounds that the longer is the string, longer is the calculation of all possible permutations, to find the best match.

Is there any tricks or advice, am missing ?

Thanks you


Solution

  • Here's some code. It is functional and tail recursive, but no idea of performance or whether there are better algorithms!

    // Count number of times first element repeats at start of list
    // Returns repeat count
    def prefixCount[T](s: List[T]): Int = {
      def loop(first: T, rem: List[T], matchCount: Int): Int =
        rem match {
          case hd :: tail if hd == first =>
            loop(first, tail, matchCount + 1)
          case _ =>
            matchCount
        }
    
      s match {
        case hd :: tail => loop(hd, tail, 1)
        case _ => s.size
      }
    }
    
    // Find the largest repeating group at start of sequence
    // Returns (groupSize, groupCount)
    def prefixGroup[T](seq: Seq[T]): (Int, Int) = {
      def loop(len: Int): (Int, Int) =
        if (len == 0) {
          (1, 1)
        } else {
          val g = seq.grouped(len).toList
          val c = prefixCount(g)
          if (c > 1) {
            (len, c)
          } else {
            loop(len-1)
          }
        }
    
      seq.size match {
        case 0 => (0,1)
        case n => loop(n/2)
      }
    }
    
    // Find all prefix sequences
    def prefixAll[T](v: Seq[T]): List[(Seq[T], Int)] = {
      def loop(rem: Seq[T], res: List[(Seq[T], Int)]): List[(Seq[T], Int)] =
        rem match {
          case Nil =>
            res.reverse
          case hd :: Nil =>
            ((rem, 1) +: res).reverse
          case _ =>
            val (groupSize, groupCount) = prefixGroup(rem)
    
            loop(rem.drop(groupSize*groupCount), (rem.take(groupSize), groupCount) +: res)
        }
    
      loop(v, Nil)
    }
    
    def prefixToString[T](p: List[(Seq[T], Int)]): String =
      p.map {
        case (s, 1) => s.mkString(",")
        case (s, l) => l.toString + "{" + s.mkString(",") + "}"
      }.mkString(",")
    
    def test(s: String) = {
      val v = s.split(",").toVector
      val a = prefixAll(v)
    
      print(s"$s => ")
      println(prefixToString(a))
    }
    
    val tests = List(
      "",
      "1",
      "1,2,3,4,5,6",
      "1,1,12,1,1,12,13",
      "1,1,12,1,1,12,13,1,1,12",
      "2,1,2,2,1,2,1,2,2",
    )
    
    tests.foreach(test)
    

    Output is:

      => 
    1 => 1
    1,2,3,4,5,6 => 1,2,3,4,5,6
    1,1,12,1,1,12,13 => 2{1,1,12},13
    1,1,12,1,1,12,13,1,1,12 => 2{1,1,12},13,2{1},12
    2,1,2,2,1,2,1,2,2 => 2{2,1,2},1,2{2}