I have the following XML file. I would like to get the values of the first Hsp_qseq, Hsp_hseq and Hsp_midline elements under the Hsp tag, using VB.NET, from the file out.xml:
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.2.25+</BlastOutput_version>
<BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
<BlastOutput_db>positive_Controls</BlastOutput_db>
<BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
<BlastOutput_query-def>rs8192709_C Positive Contol Common Sequence</BlastOutput_query-def>
<BlastOutput_query-len>249</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>10</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
<Parameters_gap-open>0</Parameters_gap-open>
<Parameters_gap-extend>0</Parameters_gap-extend>
<Parameters_filter>L;m;</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_1</Iteration_query-ID>
<Iteration_query-def>rs8192709_C Positive Contol Common Sequence</Iteration_query-def>
<Iteration_query-len>249</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gnl|BL_ORD_ID|0</Hit_id>
<Hit_def>rs8192709_C Positive Contol Common Sequence</Hit_def>
<Hit_accession>0</Hit_accession>
<Hit_len>249</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>460.936057665848</Hsp_bit-score>
<Hsp_score>249</Hsp_score>
<Hsp_evalue>9.74431021697707e-133</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>249</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>249</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>1</Hsp_hit-frame>
<Hsp_identity>249</Hsp_identity>
<Hsp_positive>249</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>249</Hsp_align-len>
<Hsp_qseq>GGTCAGGATAAAAGGCCCAGTTGGAGGCTGCAGCAGGGTGCAGGGCAGTCAGACCAGGACCATGGAACTCAGCGTCCTCCTCTTCCTTGCACTCCTCACAGGACTCTTGCTACTCCTGGTTCAGCGCCACCCTAACACCCATGACCGCCTCCCACCAGGGCCCCGCCCTCTGCCCCTTTTGGGAAACCTTCTGCAGATGGATAGAAGAGGCCTACTCAAATCCTTTCTGAGGGTAAGACACAGACGAAT</Hsp_qseq>
<Hsp_hseq>GGTCAGGATAAAAGGCCCAGTTGGAGGCTGCAGCAGGGTGCAGGGCAGTCAGACCAGGACCATGGAACTCAGCGTCCTCCTCTTCCTTGCACTCCTCACAGGACTCTTGCTACTCCTGGTTCAGCGCCACCCTAACACCCATGACCGCCTCCCACCAGGGCCCCGCCCTCTGCCCCTTTTGGGAAACCTTCTGCAGATGGATAGAAGAGGCCTACTCAAATCCTTTCTGAGGGTAAGACACAGACGAAT</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gnl|BL_ORD_ID|29</Hit_id>
<Hit_def>rs8192709_R Positive Control Rare Sequence </Hit_def>
<Hit_accession>29</Hit_accession>
<Hit_len>249</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>455.396108708835</Hsp_bit-score>
<Hsp_score>246</Hsp_score>
<Hsp_evalue>4.53358655933358e-131</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>249</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>249</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>1</Hsp_hit-frame>
<Hsp_identity>248</Hsp_identity>
<Hsp_positive>248</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>249</Hsp_align-len>
<Hsp_qseq>GGTCAGGATAAAAGGCCCAGTTGGAGGCTGCAGCAGGGTGCAGGGCAGTCAGACCAGGACCATGGAACTCAGCGTCCTCCTCTTCCTTGCACTCCTCACAGGACTCTTGCTACTCCTGGTTCAGCGCCACCCTAACACCCATGACCGCCTCCCACCAGGGCCCCGCCCTCTGCCCCTTTTGGGAAACCTTCTGCAGATGGATAGAAGAGGCCTACTCAAATCCTTTCTGAGGGTAAGACACAGACGAAT</Hsp_qseq>
<Hsp_hseq>GGTCAGGATAAAAGGCCCAGTTGGAGGCTGCAGCAGGGTGCAGGGCAGTCAGACCAGGACCATGGAACTCAGCGTCCTCCTCTTCCTTGCACTCCTCACAGGACTCTTGCTACTCCTGGTTCAGTGCCACCCTAACACCCATGACCGCCTCCCACCAGGGCCCCGCCCTCTGCCCCTTTTGGGAAACCTTCTGCAGATGGATAGAAGAGGCCTACTCAAATCCTTTCTGAGGGTAAGACACAGACGAAT</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>58</Statistics_db-num>
<Statistics_db-len>24590</Statistics_db-len>
<Statistics_hsp-len>15</Statistics_hsp-len>
<Statistics_eff-space>5550480</Statistics_eff-space>
<Statistics_kappa>0.46</Statistics_kappa>
<Statistics_lambda>1.28</Statistics_lambda>
<Statistics_entropy>0.85</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>
I am trying the following code, but I don't know how many times I have to call the .Read function.
''' <summary>
''' Shows Form2, then reads the first Hsp_qseq, Hsp_hseq and Hsp_midline
''' values from the BLAST XML output file and displays them in a message box.
''' Any error raised while opening or parsing the file is reported through a
''' message box instead of being propagated to the caller.
''' </summary>
Private Sub ReadFromXML()
    Dim m_xmlr As XmlTextReader = Nothing
    Form2.Visible = True
    Try
        'Load the Xml file
        m_xmlr = New XmlTextReader("C:\Program Files\NCBI\blast-2.2.25+\bin\similarity\out.xml")
        m_xmlr.WhitespaceHandling = WhitespaceHandling.None

        'Seek by element name instead of counting Read() calls, so the code
        'does not depend on how many nodes precede the first Hsp_qseq.
        If m_xmlr.ReadToFollowing("Hsp_qseq") Then
            'ReadElementString leaves the reader on the node after the element
            'it consumed, so the three sibling elements are read one after
            'another. It throws an XmlException if the next element does not
            'have the expected name, which the Catch below reports.
            Dim qseq As String = m_xmlr.ReadElementString("Hsp_qseq")
            Dim hseq As String = m_xmlr.ReadElementString("Hsp_hseq")
            Dim midline As String = m_xmlr.ReadElementString("Hsp_midline")

            MsgBox("Hsp_qseq: " & qseq & vbCrLf &
                   "Hsp_hseq: " & hseq & vbCrLf &
                   "Hsp_midline: " & midline)
        Else
            MsgBox("No Hsp_qseq element was found in the file.")
        End If
    Catch ex As Exception
        MsgBox(ex.Message)
    Finally
        'Close in Finally so the file handle is released on every path; the
        'reader is still Nothing if the constructor itself failed.
        If m_xmlr IsNot Nothing Then
            m_xmlr.Close()
        End If
    End Try
End Sub
Or is there a better way to do this?
Thanks
I'd use XPath to pull out the information you need from the file since it allows you to query for exactly the nodes you need.
XPath queries can look quite hairy, but for simple operations it's fairly easy to get started. Here's some sample code that uses XPath to pull out the values of the nodes you mentioned and prints them to the console:
Imports System.Xml.XPath
Imports System.IO
Module Module1
    ''' <summary>
    ''' Opens C:\out.xml, prints the value of every Hsp_qseq, Hsp_hseq and
    ''' Hsp_midline node (in that order) to the console, then waits for input
    ''' on the console before closing the file.
    ''' </summary>
    Sub Main()
        Using xmlStream As New FileStream("C:\out.xml", FileMode.Open, FileAccess.Read)
            Dim navigator As XPathNavigator = New XPathDocument(xmlStream).CreateNavigator()

            'Each call runs one XPath query and prints every node it matches.
            WriteMatches(navigator, "//BlastOutput/BlastOutput_iterations/Iteration/Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_qseq", "Hsp_qseq: {0}")
            WriteMatches(navigator, "//BlastOutput/BlastOutput_iterations/Iteration/Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_hseq", "Hsp_hseq: {0}")
            WriteMatches(navigator, "//BlastOutput/BlastOutput_iterations/Iteration/Iteration_hits/Hit/Hit_hsps/Hsp/Hsp_midline", "Hsp_midline: {0}")

            Console.Read()
        End Using
    End Sub

    ''' <summary>
    ''' Runs the given XPath query and writes the value of each matched node
    ''' to the console using the supplied composite format string.
    ''' </summary>
    ''' <param name="navigator">Navigator over the loaded XML document.</param>
    ''' <param name="xpath">XPath expression selecting the nodes to print.</param>
    ''' <param name="lineFormat">Format string; {0} receives the node value.</param>
    Private Sub WriteMatches(ByVal navigator As XPathNavigator, ByVal xpath As String, ByVal lineFormat As String)
        Dim matches As XPathNodeIterator = navigator.Select(xpath)
        Do While matches.MoveNext()
            Console.WriteLine(lineFormat, matches.Current.Value)
        Loop
    End Sub
End Module
The only interesting parts of the code above are the Dim Foo = Nav.Select("...")
lines. The argument is the XPath expression that selects the nodes you want - in this case it's a simple path from the root down to the node you're after, but it is possible to write much more powerful queries.
Each call returns an iterator over the matched nodes, so then it's just a case of iterating through and processing each node that's returned.