Search code examples
php html dom xpath domxpath

Scraping values from multiple table cells that contain a specific class within a specific tbody tag


I would like to extract data from tables like the following one. I have already managed to extract some of the values, but there are others that I just cannot get.

<table cellspacing="1" cellpadding="1" class="troop_details inAttack"
            >
                <thead>
                    <tr>
                        <td class="role">
                                                    <a href="/karte.php?d=91629">02]</a>
                                            </td>
                        <td colspan="11" class="troopHeadline">
                                                    <a class="markAttack" onclick="Travian.AttackSymbol.markAttackSymbol(29447487);return false;"><img id="markSymbol_29447487" class="markAttack markAttack0" src="/img/x.gif" title="mark attack" alt="mark attack" /></a>
                                                                        <a href="/karte.php?d=91628">Uanm attacks 01] #WorkInProgress</a>
                                            </td>
                    </tr>
                </thead>
                <tbody class="units">
                    <tr>
                        <th class="coords">
                                                    &#x202d;<span class="coordinates coordinatesWrapper coordinatesAligned coordinatesltr"><span class="coordinateX">(&#x202d;0&#x202c;</span><span class="coordinatePipe">|</span><span class="coordinateY">&#x202d;&minus;&#x202d;28&#x202c;&#x202c;)</span></span>&#x202c;                                   </th>
                                            <td class="uniticon">
                                <img class="unit u21" title="Phalanx: 0:04:17" alt="Phalanx" src="/img/x.gif" />                    </td>
                                            <td class="uniticon">
                                <img class="unit u22" title="Swordsman: 0:05:00" alt="Swordsman" src="/img/x.gif" />                    </td>
                                            <td class="uniticon">
                                <img class="unit u23" title="Pathfinder: 0:01:46" alt="Pathfinder" src="/img/x.gif" />                  </td>
                                            <td class="uniticon">
                                <img class="unit u24" title="Theutates Thunder: 0:01:35" alt="Theutates Thunder" src="/img/x.gif" />                    </td>
                                            <td class="uniticon">
                                <img class="unit u25" title="Druidrider: 0:01:53" alt="Druidrider" src="/img/x.gif" />                  </td>
                                            <td class="uniticon">
                                <img class="unit u26" title="Haeduan: 0:02:18" alt="Haeduan" src="/img/x.gif" />                    </td>
                                            <td class="uniticon">
                                <img class="unit u27" title="Ram: 0:07:30" alt="Ram" src="/img/x.gif" />                    </td>
                                            <td class="uniticon">
                                <img class="unit u28" title="Trebuchet: 0:10:00" alt="Trebuchet" src="/img/x.gif" />                    </td>
                                            <td class="uniticon">
                                <img class="unit u29" title="Chieftain: 0:06:00" alt="Chieftain" src="/img/x.gif" />                    </td>
                                            <td class="uniticon">
                                <img class="unit u30" title="Settler: 0:06:00" alt="Settler" src="/img/x.gif" />                    </td>
                                                            <td class="uniticon last">
                                <img class="unit uhero" title="Hero" alt="Hero" src="/img/x.gif" />                 </td>
                                    </tr>
                </tbody>
        
                <tbody class="units last">
                    <tr>
                        <th>Troops</th>
                                                            <td class="unit">
                                                            1                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none">
                                                            0                                           </td>
                                            <td class="unit none last">
                                                            0                                           </td>
                                    </tr>
                </tbody>
        
                
                <tbody class="infos">
                    <tr>
                        <th>Arrival</th>
                        <td colspan="11">
                            <div class="in">in&nbsp;<span  class="timer" counting="down" value="246">0:04:06</span>&nbsp;hrs.</div>
                            <div class="at"><span>at&nbsp;14:23:09</span><span> </span></div>
                        </td>
                    </tr>
                </tbody>
            </table>

In particular, I am interested in the coordinates and the units:

0
-28
1 0 0 0 0 0 0 0 0 0 0

To extract the other parts of the HTML I am using the DOMXPath class, but in this case I cannot access that data.

EDIT: This is my code currently, I was able to get some of the data.

How can I remove the last character of the y coordinate, i.e. the closing parenthesis ")"?

And how can I get the unit values? i.e. 1 0 0 0 0 0 0 etc.

// Parse the fetched page ($CasermaUtente holds its HTML). libxml parse
// warnings are collected internally instead of being printed.
libxml_use_internal_errors(true);
$dom = new DOMDocument;
$dom->loadHTML($CasermaUtente);
$xpath = new DOMXPath($dom);

$texts = [];
$count = 0;

// Every table carrying both the "troop_details" and "inAttack" classes is one incoming attack.
$attackTables = $xpath->query("//table[contains(@class, 'troop_details') and contains(@class, 'inAttack')]");
foreach ($attackTables as $attackTable) {
    // Absolute path of this table, used as the prefix of the per-table queries below.
    $base = $attackTable->getNodePath();

    $headline = $xpath->query("{$base}//td[@class='troopHeadline']//a[@href]/text()")->item(0)->nodeValue;
    $arrival  = $xpath->query("{$base}//div[@class='at']/span[starts-with(text(), 'at')]/text()")->item(0)->nodeValue;
    $rawX     = $xpath->query("{$base}//span[@class='coordinateX']/text()")->item(0)->nodeValue;
    $rawY     = $xpath->query("{$base}//span[@class='coordinateY']/text()")->item(0)->nodeValue;

    // Each value is stored wrapped in a one-element array.
    $texts[]  = [$headline];
    // Keep only the last 8 bytes of the arrival text (the hh:mm:ss part in the sample markup).
    $orario[] = [substr($arrival, -8)];
    // Drop the first character (the opening parenthesis in the sample markup).
    $xcoord[] = [mb_substr($rawX, 1)];
    // Stored verbatim, so the closing parenthesis of the sample markup is still present.
    $ycoord[] = [$rawY];

    $count++;
}

?>

<table width="350px" border="1">
<?php
/*
 * Print one table row per scraped attack table.
 *
 * $count and the parallel arrays $xcoord, $ycoord, $texts and $orario are
 * expected to be filled by the scraping loop earlier in the script; every
 * entry is a one-element array, which is why each value goes through implode().
 *
 * NOTE(review): the second and fourth cells are never closed with </td> and
 * the row is never closed with </tr>, so the emitted markup relies on the
 * browser's error recovery.
 */
for ($i=0; $i<$count ; $i++) { ?>
<tr><td align="center"> <?php /* X coordinate */ $stampa_xcoord = implode($xcoord[$i]);
echo $stampa_xcoord; ?>
</td>
<td align="center"> <?php /* Y coordinate */ $stampa_ycoord = implode($ycoord[$i]);
echo $stampa_ycoord; ?>
<td align="center"><?php /* headline link text */ $stampa_stringa = implode($texts[$i]);
echo $stampa_stringa; ?>
</td>
<td align="center"> <?php /* arrival time */ $stampa_orario = implode($orario[$i]);
echo $stampa_orario; }?>
</table>

My output looks something like this:

enter image description here


Solution

  • You will need to create an inner loop to pick up all of the unit values.

    strstr() with a 3rd parameter of true is a good technique to use when you want to isolate the substring before the first occurrence of another substring. This is reliable if ] is guaranteed to exist in the text strings. If that symbol is not guaranteed to exist, then explode() may be more appropriate (in which case you unconditionally access the first element of the array that explode creates).

    Rather than echoing a lot of html markup with php variables, I like how clean the printf() syntax is.

    Code: (Demo)

    // Parse the fetched page ($CasermaUtente holds its HTML). libxml parse
    // warnings are collected internally instead of being printed.
    $dom = new DOMDocument;
    libxml_use_internal_errors(true);
    $dom->loadHTML($CasermaUtente);
    $xpath = new DOMXPath($dom);
    
    // Parallel lists: index N of every list describes the N-th attack table.
    $texts = [];
    $orario = [];
    $xcoord = [];
    $ycoord = [];
    $unita = [];   // list of lists: one list of unit counts per attack table
    foreach ($xpath->query("//table[contains(@class, 'troop_details') and contains(@class, 'inAttack')]") as $table) {
        // Absolute path of this table, used as the prefix of the per-table queries below.
        $tablePath = $table->getNodePath();
        // Headline text up to (not including) the first "]".
        $texts[] = strstr($xpath->query("$tablePath//td[@class='troopHeadline']//a[@href]/text()")[0]->nodeValue, ']', true);
        // Last 8 bytes of the arrival text, i.e. the hh:mm:ss part.
        $orario[] = substr($xpath->query("$tablePath//div[@class='at']/span[starts-with(text(), 'at')]/text()")[0]->nodeValue, -8);
        // Strip the parentheses that wrap the coordinate pair.
        $xcoord[] = ltrim($xpath->query("$tablePath//span[@class='coordinateX']")[0]->nodeValue, '(');
        $ycoord[] = rtrim($xpath->query("$tablePath//span[@class='coordinateY']")[0]->nodeValue, ')');
        // Collect the unit counts of THIS table only. Appending them to one
        // shared flat list would make every output row show the units of all tables.
        $unitaTabella = [];
        foreach ($xpath->query("$tablePath//tbody[contains(@class, 'units') and contains(@class, 'last')]//td[contains(@class, 'unit')]/text()") as $unit) {
            $unitaTabella[] = trim($unit->nodeValue);
        }
        $unita[] = $unitaTabella;
    }
    
    // The values come from scraped markup, so escape them before writing them
    // back into HTML. The cast covers strstr() returning false when "]" is absent.
    $esc = function ($value) {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };
    
    echo '<table width="350px" border="1">';
    foreach ($texts as $i => $text) {
        printf('
            <tr>
                <td align="center">%s</td>
                <td align="center">%s</td>
                <td align="center">%s</td>
                <td align="center">%s</td>
                <td align="center">%s</td>
            </tr>',
            $esc($xcoord[$i]),
            $esc($ycoord[$i]),
            $esc($text),
            $esc($orario[$i]),
            $esc(implode(' ', $unita[$i]))
        );
        echo "\n"; // for readability in online demo
    }
    echo '</table>';
    

    Output:

    <table width="350px" border="1">
            <tr>
                <td align="center">‭0‬</td>
                <td align="center">‭−‭28‬‬</td>
                <td align="center">Uanm attacks 01</td>
                <td align="center">14:23:09</td>
                <td align="center">1 0 0 0 0 0 0 0 0 0 0</td>
            </tr>
    </table>