Search code examples
php json file-get-contents

Scraping file to convert to json


I am not getting the output I expected from the bottom half. I can grab the top table's data, but I am also trying to grab the bottom table's data and encode both as JSON. The columns I need to grab are:

1. Week Date Home Away At Notes

<?php

// Scrapes the shared report page: collects the cell text of every row in each
// table whose class attribute contains "report", labels the cells with the
// column names below, then merges the collected rows and prints them as JSON.

// Collect libxml parse errors internally instead of emitting a warning for each one.
libxml_use_internal_errors(true);

$doc = new DOMDocument();
$doc->loadHTML(file_get_contents('https://www.leagueleader.net/sharedreport.php?operatorid=98&code=bc155b01-7492-412d-aa75-3c1e357248f1'));
// NOTE(review): assigned after loadHTML() has already run, so it cannot affect that parse — confirm intent.
$doc->strictErrorChecking = false;

// Rows grouped by key; filled in by the table loop and consumed by the merge loop below.
$pre = [];
// Column names applied to rows when $k == 1.
$keys = ['team', 'div', 'team-site-name', 'site-address', 'site-phone'];
// Column names applied to rows when $k == 2.
$keys2 = ['week', 'date', 'home', 'away', 'at', 'notes'];
// $k is the position among ALL <table> elements in the document, not only those
// that pass the class check below.
foreach ($doc->getElementsByTagName('table') as $k => $table) {

    // Only tables whose class attribute contains "report" are processed.
    if (strpos($table->getAttribute('class'), 'report') === false) {
        continue;
    }
    foreach ($table->getElementsByTagName('tr') as $i => $tr) {
        if ($tr->parentNode->nodeName === 'thead') continue; // skip headers
        // Trimmed text of each child node of the row; nodes whose text is empty
        // are dropped, so a blank cell shortens the row and shifts later columns left.
        $row_values = [];
        foreach ($tr->childNodes as $td) {
            $text = trim($td->nodeValue);
            if ($text === '') continue;
            $row_values[] = $text;
        }

        // NOTE(review): the 1 / 2 checks assume the report tables are the 2nd and 3rd
        // <table> elements on the page — verify against the actual markup.
        // array_combine() needs exactly as many values as keys; a row that lost a
        // blank cell above will not match.
        if($k == 1 ){


            $row_values = array_combine($keys, $row_values);

        }else   if($k == 2 ){
            // Discards the second collected value before labelling the rest.
            // NOTE(review): which column this is meant to remove is not evident here — confirm.
            unset($row_values[1]);
        $row_values = array_combine($keys2, $row_values);

        }
          // NOTE(review): neither $keys nor $keys2 contains 'name', so this lookup has no
          // matching key and every row is appended under the same empty key.
          $pre[$row_values['name']][] = $row_values;

    }

}
// Flatten each group from $pre into one record.
$combined = [];
foreach($pre as $week => $row){
    // NOTE(review): $name is never assigned anywhere in this script; the loop key is $week.
    // $week here is the grouping key from $pre, not a value read from the 'week' column.
    $combined[$name] = [
        "week"=> $week,
        // $row[0] is the first row stored under this key.
        "team"=> $row[0]['team'],
        "div"=> $row[0]['div'],
        "team-site-name" => $row[0]['team-site-name'],
        "site-address" => $row[0]['site-address'],
        "site-phone" => $row[0]['site-phone'],

        //"week" => $row[1]['week'],
        // $row[1] is the second row stored under this key, whichever table it came from.
        // NOTE(review): if that row was labelled with $keys, none of the keys below exist on it.
        "date" => $row[1]['date'],
        "home" => $row[1]['home'],
        "away" => $row[1]['away'],
        "at" => $row[1]['at'],
        "notes" => $row[1]['notes']
    ];
}

echo '<pre>'.json_encode($combined, JSON_PRETTY_PRINT).'</pre>';
?>

Here is the output

{  
    "": {  
        "week": "",  
        "team": "1",  
        "div": "A",  
        "team-site-name": "Team 01Freer Bar",  
        "site-address": "\u00a07355 Michigan Ave Detroit, MI 48210",  
        "site-phone": "\u00a03138993699",  
        "date": null,  
        "home": null,  
        "away": null,  
        "at": null,  
        "notes": null  
    }  
}  

Solution

  • To get the data from the second table with the matches, I've changed the processing to use XPath. This extracts the <tr> tags from the body of the second table with class='report' (using //table[@class='report'][2]/tbody/tr).

    This returns all of the rows in the body of the table. For each row, the code extracts all of the <td> elements and picks out the details. If a week or date is present, it overwrites the current value; if there are match details, a row is added to the output...

    // Build one entry per match from the rows of the second "report" table.
    // Expects $doc to be the DOMDocument already loaded from the report page.
    $xpath = new DOMXPath($doc);
    $reportRow = $xpath->query("//table[@class='report'][2]/tbody/tr");
    $matches = [];
    // Week and date are carried forward: a row that leaves them blank reuses
    // the last values seen.
    $week = '';
    $date = '';
    // Trimmed text of cell $index, or '' when the row has no such cell, so a
    // short row never dereferences a missing node.
    $cellText = function ($cells, $index) {
        $cell = $cells->item($index);
        return $cell === null ? '' : trim($cell->textContent);
    };
    foreach ($reportRow as $row) {
        $cells = $row->getElementsByTagName("td");
        // Set week and date if present in the current row. Compared against ''
        // (not truthiness) so that a literal "0" is kept rather than ignored.
        $rowWeek = $cellText($cells, 0);
        if ($rowWeek !== '') {
            $week = $rowWeek;
        }
        $rowDate = $cellText($cells, 1);
        if ($rowDate !== '') {
            $date = $rowDate;
        }
        // Extract the other details
        $teamHome = $cellText($cells, 2);
        $teamAway = $cellText($cells, 3);
        $at = $cellText($cells, 4);
        $notes = $cellText($cells, 5);

        // If there are some match details, then store them. A strict comparison
        // is used because empty() would also discard a team named "0".
        if ($teamHome !== '') {
            $matches[] = ["week" => $week, "date" => $date,
                "teamHome" => $teamHome, "teamAway" => $teamAway,
                "at" => $at, "notes" => $notes
            ];
        }
    }
    print_r($matches);
    

    This gives...

    Array
    (
        [0] => Array
            (
                [week] => 1
                [date] => 09/10/2019
                [teamHome] => Team 01
                [teamAway] => BYE
                [at] => BYE
                [notes] => 
            )