I have a (possibly infinite) regular language which I describe with a regular expression. From this regular language I want to obtain all strings of length up to n, using scala. Some quick googling tells me there are some libraries out there that can help me. Before using an external library I want to know if this is something that is easy (as in something a decent programmer can implement in under 15 minutes) to do myself in Scala. If not, are there some good libraries that you can recommend for this?
To make what I want more concrete. Suppose I have the language A*B*
and my n
is 3, I then want the strings "", "A", "B", "AA", "AB", "BB", "AAA", "AAB", "ABB", "BBB"
.
n
.object SO {
import scala.annotation.tailrec
import scala.collection.{AbstractIterator, Iterator}
import scala.util.matching.Regex
def canonic(alphabet: Seq[Char], n: Int): Iterator[String] =
if (n < 0) Iterator.empty
else {
val r: IndexedSeq[Iterator[String]] = for (i <- 1 to n)
yield new CanonicItr(alphabet, i)
r.reduce(_ ++ _)
}
private class CanonicItr(alphabet: Seq[Char], width: Int) extends AbstractIterator[String] {
val aSize = alphabet.size
val alph = alphabet.toVector
val total = aSizePower(width)
println("total " + total)
private var pos = 0L
private def aSizePower(r: Int): Long = scala.math.pow(aSize, r).toLong
def stringFor(id: Long): String = {
val r = for {
i <- (0 until width).reverse
// (738 / 10^0) % 10 = 8
// (738 / 10^1) % 10 = 3
// (738 / 10^2) % 10 = 7
charIdx = ((id / (aSizePower(i))) % aSize).toInt
} yield alph(charIdx)
r.mkString("")
}
override def hasNext: Boolean = pos < total
override def next(): String = {
val s = stringFor(pos)
pos = pos + 1
s
}
}
def main(args: Array[String]): Unit = {
// create all possible words with the given alphabet
val canonicWordSet = canonic(Seq('a', 'b', 'c'), 8)
// formal regular language definition
val languageDef: Regex = "a*b".r
// determine words of language by filtering the canocic set.
val wordsOfLanguage = canonicWordSet.filter(word => languageDef.pattern.matcher(word).matches)
println(wordsOfLanguage.toList)
}
}
canonic
but with high memory requirementsobject SO {
import scala.util.matching.Regex
/**
* Given a sequence of characters (e.g. Seq('a', 'b', 'c') )
* generates all combinations up to lneght of n (incl.).
*
* @param alphabet sequence of characters
* @param n is the max length
* @return all combinations of up to length n.
*/
def canonic(alphabet:Seq[Char], n: Int): Seq[String] = {
def combination( input: Seq[String], chars: Seq[Char]) = {
for {
i <- input
c <- chars
} yield (i+c)
}
@tailrec
def rec(left: Int, current: Seq[String], accum: Seq[String] ) : Seq[String] = {
left match {
case 0 => accum
case _ => {
val next = combination( current, alphabet )
rec( left-1, next, accum ++ next )
}
}
}
rec(n, Seq(""), Seq(""))
}
def main(args: Array[String]) : Unit = {
// create all possible words with the given alphabet
val canonicWordSet= canonic( Seq('a', 'b', 'c'), 3)
// formal regular language definition
val languageDef: Regex = "a*b".r
// determine words of language by filtering the canocic set.
val wordsOfLanguage = canonicWordSet.filter( word => languageDef.pattern.matcher(word).matches )
println( wordsOfLanguage.toList )
}
}
canonic
not working correctlydef canonic(alphabet:Seq[Char], n: Int): Iterator[String] = {
for {
i <- (0 to n).iterator
combi <- alphabet.combinations(i).map(cs => cs.mkString)
} yield combi
}