Search code examples
xmlvb.netvisual-studio-2010visual-studio-2008blast

Parse XML file VB.NET


I have the following XML file, out.xml. In VB.NET, I would like to get the values of the first Hsp_qseq, Hsp_hseq and Hsp_midline elements under the Hsp tag.

<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">
<BlastOutput>
  <BlastOutput_program>blastn</BlastOutput_program>
  <BlastOutput_version>BLASTN 2.2.25+</BlastOutput_version>
  <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), &quot;A greedy algorithm for aligning DNA sequences&quot;, J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
  <BlastOutput_db>positive_Controls</BlastOutput_db>
  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
  <BlastOutput_query-def>rs8192709_C Positive Contol Common Sequence</BlastOutput_query-def>
  <BlastOutput_query-len>249</BlastOutput_query-len>
  <BlastOutput_param>
    <Parameters>
      <Parameters_expect>10</Parameters_expect>
      <Parameters_sc-match>1</Parameters_sc-match>
      <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
      <Parameters_gap-open>0</Parameters_gap-open>
      <Parameters_gap-extend>0</Parameters_gap-extend>
      <Parameters_filter>L;m;</Parameters_filter>
    </Parameters>
  </BlastOutput_param>
  <BlastOutput_iterations>
    <Iteration>
      <Iteration_iter-num>1</Iteration_iter-num>
      <Iteration_query-ID>Query_1</Iteration_query-ID>
      <Iteration_query-def>rs8192709_C Positive Contol Common Sequence</Iteration_query-def>
      <Iteration_query-len>249</Iteration_query-len>
      <Iteration_hits>
        <Hit>
          <Hit_num>1</Hit_num>
          <Hit_id>gnl|BL_ORD_ID|0</Hit_id>
          <Hit_def>rs8192709_C Positive Contol Common Sequence</Hit_def>
          <Hit_accession>0</Hit_accession>
          <Hit_len>249</Hit_len>
          <Hit_hsps>
            <Hsp>
              <Hsp_num>1</Hsp_num>
              <Hsp_bit-score>460.936057665848</Hsp_bit-score>
              <Hsp_score>249</Hsp_score>
              <Hsp_evalue>9.74431021697707e-133</Hsp_evalue>
              <Hsp_query-from>1</Hsp_query-from>
              <Hsp_query-to>249</Hsp_query-to>
              <Hsp_hit-from>1</Hsp_hit-from>
              <Hsp_hit-to>249</Hsp_hit-to>
              <Hsp_query-frame>1</Hsp_query-frame>
              <Hsp_hit-frame>1</Hsp_hit-frame>
              <Hsp_identity>249</Hsp_identity>
              <Hsp_positive>249</Hsp_positive>
              <Hsp_gaps>0</Hsp_gaps>
              <Hsp_align-len>249</Hsp_align-len>
              <Hsp_qseq>GGTCAGGATAAAAGGCCCAGTTGGAGGCTGCAGCAGGGTGCAGGGCAGTCAGACCAGGACCATGGAACTCAGCGTCCTCCTCTTCCTTGCACTCCTCACAGGACTCTTGCTACTCCTGGTTCAGCGCCACCCTAACACCCATGACCGCCTCCCACCAGGGCCCCGCCCTCTGCCCCTTTTGGGAAACCTTCTGCAGATGGATAGAAGAGGCCTACTCAAATCCTTTCTGAGGGTAAGACACAGACGAAT</Hsp_qseq>
              <Hsp_hseq>GGTCAGGATAAAAGGCCCAGTTGGAGGCTGCAGCAGGGTGCAGGGCAGTCAGACCAGGACCATGGAACTCAGCGTCCTCCTCTTCCTTGCACTCCTCACAGGACTCTTGCTACTCCTGGTTCAGCGCCACCCTAACACCCATGACCGCCTCCCACCAGGGCCCCGCCCTCTGCCCCTTTTGGGAAACCTTCTGCAGATGGATAGAAGAGGCCTACTCAAATCCTTTCTGAGGGTAAGACACAGACGAAT</Hsp_hseq>
              <Hsp_midline>|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
            </Hsp>
          </Hit_hsps>
        </Hit>
        <Hit>
          <Hit_num>2</Hit_num>
          <Hit_id>gnl|BL_ORD_ID|29</Hit_id>
          <Hit_def>rs8192709_R Positive Control Rare Sequence </Hit_def>
          <Hit_accession>29</Hit_accession>
          <Hit_len>249</Hit_len>
          <Hit_hsps>
            <Hsp>
              <Hsp_num>1</Hsp_num>
              <Hsp_bit-score>455.396108708835</Hsp_bit-score>
              <Hsp_score>246</Hsp_score>
              <Hsp_evalue>4.53358655933358e-131</Hsp_evalue>
              <Hsp_query-from>1</Hsp_query-from>
              <Hsp_query-to>249</Hsp_query-to>
              <Hsp_hit-from>1</Hsp_hit-from>
              <Hsp_hit-to>249</Hsp_hit-to>
              <Hsp_query-frame>1</Hsp_query-frame>
              <Hsp_hit-frame>1</Hsp_hit-frame>
              <Hsp_identity>248</Hsp_identity>
              <Hsp_positive>248</Hsp_positive>
              <Hsp_gaps>0</Hsp_gaps>
              <Hsp_align-len>249</Hsp_align-len>
              <Hsp_qseq>GGTCAGGATAAAAGGCCCAGTTGGAGGCTGCAGCAGGGTGCAGGGCAGTCAGACCAGGACCATGGAACTCAGCGTCCTCCTCTTCCTTGCACTCCTCACAGGACTCTTGCTACTCCTGGTTCAGCGCCACCCTAACACCCATGACCGCCTCCCACCAGGGCCCCGCCCTCTGCCCCTTTTGGGAAACCTTCTGCAGATGGATAGAAGAGGCCTACTCAAATCCTTTCTGAGGGTAAGACACAGACGAAT</Hsp_qseq>
              <Hsp_hseq>GGTCAGGATAAAAGGCCCAGTTGGAGGCTGCAGCAGGGTGCAGGGCAGTCAGACCAGGACCATGGAACTCAGCGTCCTCCTCTTCCTTGCACTCCTCACAGGACTCTTGCTACTCCTGGTTCAGTGCCACCCTAACACCCATGACCGCCTCCCACCAGGGCCCCGCCCTCTGCCCCTTTTGGGAAACCTTCTGCAGATGGATAGAAGAGGCCTACTCAAATCCTTTCTGAGGGTAAGACACAGACGAAT</Hsp_hseq>
              <Hsp_midline>|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
            </Hsp>
          </Hit_hsps>
        </Hit>
      </Iteration_hits>
      <Iteration_stat>
        <Statistics>
          <Statistics_db-num>58</Statistics_db-num>
          <Statistics_db-len>24590</Statistics_db-len>
          <Statistics_hsp-len>15</Statistics_hsp-len>
          <Statistics_eff-space>5550480</Statistics_eff-space>
          <Statistics_kappa>0.46</Statistics_kappa>
          <Statistics_lambda>1.28</Statistics_lambda>
          <Statistics_entropy>0.85</Statistics_entropy>
        </Statistics>
      </Iteration_stat>
    </Iteration>
  </BlastOutput_iterations>
</BlastOutput>

I am trying the following code, but I don't know how many times I have to call the .Read function.

''' <summary>
''' Reads the BLAST XML report and shows the Hsp_qseq, Hsp_hseq and
''' Hsp_midline values of the first Hsp element in a message box.
''' </summary>
''' <remarks>
''' Any exception raised while opening or parsing the file is caught and its
''' message is shown in a message box instead of propagating to the caller.
''' </remarks>
Private Sub ReadFromXML()

    ' Location of the BLAST XML report to read.
    Const reportPath As String = "C:\Program Files\NCBI\blast-2.2.25+\bin\similarity\out.xml"

    Dim qseq As String = Nothing
    Dim hseq As String = Nothing
    Dim midline As String = Nothing

    Form2.Visible = True
    Try

        ' Using guarantees the reader is closed even when parsing fails, and
        ' avoids calling Close() on a reader that was never created.
        Using m_xmlr As New XmlTextReader(reportPath)
            m_xmlr.WhitespaceHandling = WhitespaceHandling.None

            ' Jump straight to the first <Hsp> by name instead of counting
            ' Read() calls, which breaks as soon as the document layout changes.
            If Not m_xmlr.ReadToFollowing("Hsp") Then
                MsgBox("No Hsp element was found in " & reportPath)
                Return
            End If

            ' Walk the children of that first <Hsp> only; stop at its end tag
            ' so values from later hits are never mixed in.
            While m_xmlr.Read()
                If m_xmlr.NodeType = XmlNodeType.EndElement AndAlso m_xmlr.Name = "Hsp" Then
                    Exit While
                End If

                If m_xmlr.NodeType = XmlNodeType.Element Then
                    ' ReadString consumes the element's text; the next Read()
                    ' of the loop then advances to the following node.
                    Select Case m_xmlr.Name
                        Case "Hsp_qseq"
                            qseq = m_xmlr.ReadString()
                        Case "Hsp_hseq"
                            hseq = m_xmlr.ReadString()
                        Case "Hsp_midline"
                            midline = m_xmlr.ReadString()
                    End Select
                End If
            End While
        End Using

        MsgBox("Hsp_qseq: " & qseq & vbCrLf &
               "Hsp_hseq: " & hseq & vbCrLf &
               "Hsp_midline: " & midline)

    Catch ex As Exception
        MsgBox(ex.Message)
    End Try
End Sub

Or is there a better way to do this?

Thanks


Solution

  • I'd use XPath to pull out the information you need from the file since it allows you to query for exactly the nodes you need.

    XPath queries can be quite hairy looking, but for simple operations it's fairly easy to get started with. Here's some sample code that pulls out the values of those nodes you mentioned using XPath and prints their values to the console:

    Imports System.Xml.XPath
    Imports System.IO
    
    Module Module1

        ''' <summary>
        ''' Opens C:\out.xml and writes the value of every Hsp_qseq node, then
        ''' every Hsp_hseq node, then every Hsp_midline node to the console.
        ''' Waits for a key press before the file stream is released.
        ''' </summary>
        Sub Main()

            Using Input As New FileStream("C:\out.xml", FileMode.Open, FileAccess.Read)

                Dim Navigator = New XPathDocument(Input).CreateNavigator()

                'Query sequence of each Hsp.
                For Each Node As XPathNavigator In Navigator.Select("//BlastOutput/BlastOutput_iterations/Iteration/Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_qseq")
                    Console.WriteLine("Hsp_qseq: {0}", Node.Value)
                Next

                'Hit sequence of each Hsp.
                For Each Node As XPathNavigator In Navigator.Select("//BlastOutput/BlastOutput_iterations/Iteration/Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_hseq")
                    Console.WriteLine("Hsp_hseq: {0}", Node.Value)
                Next

                'Alignment midline of each Hsp.
                For Each Node As XPathNavigator In Navigator.Select("//BlastOutput/BlastOutput_iterations/Iteration/Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_midline")
                    Console.WriteLine("Hsp_midline: {0}", Node.Value)
                Next

                'Keep the console window open until input is available.
                Console.Read()

            End Using

        End Sub

    End Module
    

    The only interesting part of the code above is the Dim Foo = Nav.Select("...") bits. The argument is the XPath expression that selects the information you want - in this case it's a simple path from the root down to the node you're after, but it is possible to write much more powerful queries.

    Each Select call returns an iterator over the matched nodes, so then it's just a case of iterating through and processing each node that's returned.