Search code examples
regexscalaregular-language

Scala: Find all strings of length up to n in regular language


I have a (possibly infinite) regular language which I describe with a regular expression. From this regular language I want to obtain all strings of length up to n, using scala. Some quick googling tells me there are some libraries out there that can help me. Before using an external library I want to know if this is something that is easy (as in something a decent programmer can implement in under 15 minutes) to do myself in Scala. If not, are there some good libraries that you can recommend for this?

To make what I want more concrete. Suppose I have the language A*B* and my n is 3, I then want the strings "", "A", "B", "AA", "AB", "BB", "AAA", "AAB", "ABB", "BBB".


Solution

  • Answer

    Edits

    • 26Nov, 4:30pm - added iterator-based version to reduce runtime and memory consumption. Seq-based version of canonic is at the bottom under (1)
    • 26Nov, 2:45pm - added working seq-based version for canonic, non working old version of canonic is at the bottom (2)

    Approach

    • Canonically generate all words possible for a given alphabet up to length n.
    • Filter the generated words by a regular expression (your regular language in that case)

    Code

    object SO {
    
      import scala.annotation.tailrec
      import scala.collection.{AbstractIterator, Iterator}
      import scala.util.matching.Regex
    
      def canonic(alphabet: Seq[Char], n: Int): Iterator[String] =
        if (n < 0) Iterator.empty
        else {
          val r: IndexedSeq[Iterator[String]] = for (i <- 1 to n)
            yield new CanonicItr(alphabet, i)
          r.reduce(_ ++ _)
        }
    
      private class CanonicItr(alphabet: Seq[Char], width: Int) extends AbstractIterator[String] {
        val aSize = alphabet.size
        val alph = alphabet.toVector
        val total = aSizePower(width)
    
        println("total " + total)
    
        private var pos = 0L
    
        private def aSizePower(r: Int): Long = scala.math.pow(aSize, r).toLong
    
        def stringFor(id: Long): String = {
          val r = for {
            i <- (0 until width).reverse
            // (738 / 10^0) % 10 = 8
            // (738 / 10^1) % 10 = 3
            // (738 / 10^2) % 10 = 7
            charIdx = ((id / (aSizePower(i))) % aSize).toInt
          } yield alph(charIdx)
          r.mkString("")
        }
    
        override def hasNext: Boolean = pos < total
    
        override def next(): String = {
          val s = stringFor(pos)
          pos = pos + 1
          s
        }
      }
    
    
      def main(args: Array[String]): Unit = {
    
        // create all possible words with the given alphabet 
        val canonicWordSet = canonic(Seq('a', 'b', 'c'), 8)
    
        // formal regular language definition
        val languageDef: Regex = "a*b".r
    
        // determine words of language by filtering the canocic set. 
        val wordsOfLanguage = canonicWordSet.filter(word => languageDef.pattern.matcher(word).matches)
    
        println(wordsOfLanguage.toList)
      }
    }
    

    1) Working version of canonic but with high memory requirements

    object SO {
    
      import scala.util.matching.Regex
    
      /**
        * Given a sequence of characters (e.g. Seq('a', 'b', 'c') )
        * generates all combinations up to lneght of n (incl.).
        * 
        * @param alphabet sequence of characters
        * @param n is the max length
        * @return all combinations of up to length n. 
        */
      def canonic(alphabet:Seq[Char], n: Int): Seq[String] = {
        def combination( input: Seq[String], chars: Seq[Char]) = {
          for {
            i <- input
            c <- chars
          } yield (i+c)
        }
    
        @tailrec
        def rec(left: Int, current: Seq[String], accum: Seq[String] ) : Seq[String] = {
          left match {
            case 0 => accum
            case _ => {
              val next = combination( current, alphabet )
              rec( left-1, next, accum ++ next )
            }     
          }
        }
    
        rec(n, Seq(""), Seq(""))
      }
    
      def main(args: Array[String]) : Unit = {
    
        // create all possible words with the given alphabet 
        val canonicWordSet= canonic( Seq('a', 'b', 'c'), 3)
    
        // formal regular language definition
        val languageDef: Regex = "a*b".r
    
        // determine words of language by filtering the canocic set. 
        val wordsOfLanguage = canonicWordSet.filter( word => languageDef.pattern.matcher(word).matches )
    
        println( wordsOfLanguage.toList )
      }
    }
    

    2) Non working version of canonic not working correctly

    def canonic(alphabet:Seq[Char], n: Int): Iterator[String] = {
      for {
        i <- (0 to n).iterator
        combi <- alphabet.combinations(i).map(cs => cs.mkString)
      } yield combi
    }