Search code examples
java bioinformatics biojava

Java Bioinformatics - get all indexes of multiple specific words in a string


I have a project in my Bioinformatics course at the university, and one of the things in my project is gene prediction.

My problem today is how to get all indexes of more than one specific word in a string. For example, in my case I want to find all occurrences of start codons ("AUG") and stop codons ("UAA", "UAG", "UGA") and use them to predict genes, i.e. to find Open Reading Frames (ORFs).

Here is my initial code:

/**
 * Handles a click on jButton3: reads the strand typed into {@code jTextField1},
 * converts it to RNA and writes every open reading frame found in reading
 * frame 0 to {@code jTextArea1}.
 *
 * <p>An open reading frame starts at the first "AUG" codon that is not already
 * inside a frame and ends with the next in-frame stop codon ("UAA", "UAG" or
 * "UGA"); the stop codon is included in the reported gene. After a frame is
 * closed the scan continues, so several genes may be reported. Only codons
 * aligned to index 0 are examined; the other two reading frames are not
 * scanned.
 *
 * @param evt the action event passed by the caller (not inspected)
 */
private void jButton3ActionPerformed(java.awt.event.ActionEvent evt) {
    String str = jTextField1.getText();
    if (str == null || "".equals(str)) {
        jTextArea1.setText("No DNA strand entered.. ");
        return;
    }
    if (checksum(str) != 100) {
        jTextArea1.setText("Text entered is not a DNA strand..");
        return;
    }

    SymbolList dna;
    try {
        dna = DNATools.toRNA(DNATools.createDNA(str));
    } catch (IllegalSymbolException ex) {
        // Stop here: without a parsed sequence there is nothing to scan, and
        // carrying on would dereference a null SymbolList.
        Logger.getLogger(m.class.getName()).log(Level.SEVERE, null, ex);
        jTextArea1.setText("Text entered is not a DNA strand..");
        return;
    } catch (IllegalAlphabetException ex) {
        Logger.getLogger(m.class.getName()).log(Level.SEVERE, null, ex);
        jTextArea1.setText("Text entered is not a DNA strand..");
        return;
    }

    // Pad with '-' up to a whole number of codons so substring(g, g + 3)
    // below can never run past the end of the sequence.
    StringBuilder padded = new StringBuilder(dna.seqString().toUpperCase());
    while (padded.length() % 3 != 0) {
        padded.append('-');
    }
    String dnax = padded.toString();

    StringBuilder text = new StringBuilder();
    int startCodonIndex = -1; // -1 means "not currently inside a reading frame"
    for (int g = 0; g < dnax.length(); g += 3) {
        String codon = dnax.substring(g, g + 3);
        if (startCodonIndex == -1) {
            if ("AUG".equals(codon)) {
                startCodonIndex = g;
            }
        } else if (isStopCodon(codon)) {
            // The end position is exclusive and includes the stop codon.
            int stopCodonIndex = g + 3;
            text.append("\nGene start position:  ").append(startCodonIndex)
                .append("\nGene end position:  ").append(stopCodonIndex)
                .append("\n Gene: ").append(dnax.substring(startCodonIndex, stopCodonIndex));
            // Close the frame so the next AUG opens a new gene.
            startCodonIndex = -1;
        }
    }

    if (text.length() > 0) {
        jTextArea1.setText(text.toString());
    } else {
        jTextArea1.setText("No genes found in Seq: " + dnax);
    }
}

/** Returns true when the given three-letter RNA codon is one of the stop codons UAA, UAG or UGA. */
private static boolean isStopCodon(String codon) {
    return "UAA".equals(codon) || "UAG".equals(codon) || "UGA".equals(codon);
}

Here is the checksum() method:

/**
 * Returns the percentage (0-100, rounded down) of characters in {@code x}
 * that are valid DNA strand characters: A, T, G, C (case-insensitive) or the
 * gap character '-'.
 *
 * @param x the text to inspect; may be null or empty
 * @return 100 only when every character is valid; 0 for null or empty input
 */
private static int checksum(String x) {
    if (x == null || x.isEmpty()) {
        // Nothing to measure; also avoids dividing by a zero length.
        return 0;
    }
    String upper = x.toUpperCase();
    int count = 0;
    for (int i = 0; i < upper.length(); i++) {
        char c = upper.charAt(i);
        if (c == 'A' || c == 'T' || c == 'G' || c == 'C' || c == '-') {
            count++;
        }
    }
    // Multiply before dividing: (count / length) * 100 in int arithmetic
    // truncates to 0 for anything short of a perfect match. The long factor
    // keeps count * 100 from overflowing on very long inputs.
    return (int) (count * 100L / upper.length());
}

I've tried other solutions, but nothing is working for me. Any help/suggestion is welcome.


Solution

  • I think you are asking how to find the indexes of those specific codons? And dnax is the String you are checking?

    You could use indexOf(String str, int fromIndex). It returns -1 if no substring was found.

    So maybe something like this might help,

    // Collect the index of every "AUG" occurrence in dnax, in ascending order.
    List<Integer> startCodonIndices = new ArrayList<Integer>();
    // indexOf returns -1 once there are no further matches, which ends the loop.
    // Resuming the search at index + 1 records each occurrence exactly once
    // and still finds a match that sits at the very end of the string.
    for (int index = dnax.indexOf("AUG"); index != -1; index = dnax.indexOf("AUG", index + 1)) {
        startCodonIndices.add(index);
    }