Search code examples
Tags: php, regex, cpu-word, preg-split, delimited

Get first three words from up to 10 comma-delimited phrases in a string


In the processPage function below, I'm grabbing the keywords from the keywords metatag of each URL processed. I need to alter the preg_split so that it only pulls the first three words of any keyword cluster.

For example, given this keywords meta tag:

<meta name="keywords" content="this is too long, this is not, keyword three" />

I only want the "this is too" part of the first keyword cluster.

Also, if the total list of keyword phrases is longer than 10, I only want to pull the first 10 keyword phrases from the list.

i.e., (keyword phrase 1, kw 2, kw 3, kw 4, ..., keyword phrase 10)

<?php

/**
 * Fetches one URL and exposes the keywords found in its
 * <meta name="keywords"> tag.
 *
 * Constructing an instance immediately downloads and parses the page.
 */
class ResultPage
{
    /** @var string Address of the page, as passed to the constructor. */
    public $url;
    /** @var mixed Not populated by the code visible in this class. */
    public $title;
    /** @var string Raw HTML returned by rseo_keywordSearch_scrapePage(). */
    public $html;
    /** @var mixed Not populated by the code visible in this class. */
    public $plainText;
    /** @var array Words tallied by GetResults(); callers are expected to fill it. */
    public $wordList = array();
    /** @var array Keyword phrases taken from the page's keywords meta tag. */
    public $keywords = array();

    /**
     * @param string $siteurl Address of the page to fetch and analyse.
     */
    public function __construct($siteurl)
    {
        $this->url = $siteurl;
        $this->processPage();
    }

    /**
     * Downloads the page and fills $html and $keywords.
     *
     * $keywords keeps its default (an empty array) when the page cannot be
     * parsed, has no keywords meta tag, or the tag's content is empty.
     */
    public function processPage()
    {
        $this->html = rseo_keywordSearch_scrapePage($this->url);

        // NOTE(review): str_get_html() is presumably simple_html_dom's parser,
        // which returns false for empty or oversized input - confirm. Without
        // this guard a failed fetch ends in a fatal "call to find() on bool".
        $dom = str_get_html($this->html);
        if (!$dom) {
            return;
        }

        $metakws = $dom->find('meta[name=keywords]');
        if (empty($metakws)) {
            return;
        }

        $content = $metakws[0]->content;
        if ($content) {
            $this->keywords = self::extractKeywords($content);
        }
    }

    /**
     * Reduces a comma-delimited keyword string to short, clean phrases.
     *
     * Each phrase is cut down to its first $maxWords whitespace-separated
     * words (joined by single spaces), empty phrases are dropped, and only
     * the first $maxPhrases non-empty phrases are returned.
     *
     * @param string $content    Raw content of a keywords meta tag.
     * @param int    $maxPhrases Upper bound on the number of phrases returned.
     * @param int    $maxWords   Upper bound on the words kept per phrase.
     * @return array List of trimmed keyword phrases, in their original order.
     */
    public static function extractKeywords($content, $maxPhrases = 10, $maxWords = 3)
    {
        $keywords = array();
        if ($maxPhrases < 1 || $maxWords < 1) {
            return $keywords;
        }

        foreach (explode(',', (string) $content) as $phrase) {
            $words = preg_split('/\s+/', $phrase, -1, PREG_SPLIT_NO_EMPTY);
            if (!$words) {
                // Blank cluster (e.g. ", ,") or a regex failure: nothing to keep.
                continue;
            }

            $keywords[] = implode(' ', array_slice($words, 0, $maxWords));
            if (count($keywords) >= $maxPhrases) {
                break;
            }
        }

        return $keywords;
    }

    /**
     * Tallies the entries of $wordList.
     *
     * @return array Whatever rseo_keyword_getCountArray() returns for $wordList.
     */
    public function GetResults()
    {
        return rseo_keyword_getCountArray($this->wordList);
    }
}


/**
 * Calls a remote web page using cURL and returns the raw HTML.
 *
 * Redirects are followed and a desktop Chrome user-agent string is sent.
 * Before returning, every occurrence of `class=l` in the response is
 * rewritten to `class='l'`.
 * NOTE(review): that replacement also hits longer values such as
 * `class=link` - confirm this is intended.
 *
 * @param string $url      Address of the page to fetch.
 * @param bool   $headonly Currently ignored: the CURLOPT_NOBODY option that
 *                         used it is disabled. Kept so existing callers
 *                         passing a second argument keep working.
 * @return string The patched response body, or '' when the handle cannot be
 *                created or the transfer fails.
 */
function rseo_keywordSearch_scrapePage($url, $headonly = TRUE ){

    $userAgent = 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.204 Safari/534.16';

    $ch = curl_init();
    if ($ch === false) {
        // No handle: the curl_setopt() calls below would fail on it.
        return '';
    }

    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_VERBOSE, FALSE);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($ch, CURLOPT_URL, $url);

    $curlResp = curl_exec($ch);
    curl_close($ch);

    // With CURLOPT_RETURNTRANSFER set, curl_exec() yields the body as a string
    // or false on failure; report failure as an explicit empty string instead
    // of relying on str_replace() coercing false.
    if (!is_string($curlResp)) {
        return '';
    }

    return str_replace("class=l","class='l'",$curlResp);
}

/**
 * Counts how often each value occurs and orders the counts from highest
 * to lowest.
 *
 * @param array $arr Values to tally (array_count_values() only counts
 *                   strings and integers).
 * @return array Map of value => number of occurrences, largest count first;
 *               an empty array when $arr is not an array.
 */
function rseo_keyword_getCountArray($arr){
    // Callers may hand over an unset word list; treat that as "nothing to count".
    if (!is_array($arr)) {
        return array();
    }

    $counts = array_count_values($arr);
    arsort($counts);
    return $counts;
}

Solution

  • This is a bit easier to do by matching rather than splitting, e.g.:

    // At the start of the string or right after a comma, skip leading
    // whitespace and capture one to three whitespace-separated words.
    // Whitespace is only captured between words, so a phrase longer than
    // three words does not leave a trailing space in the result.
    preg_match_all('/(?<=^|,)\s*([^\s,]+(?:\s+[^\s,]+){0,2})/', $metakw->content, $m);
    // Keep at most the first ten phrases.
    $this->keywords = array_slice($m[1], 0, 10);
    
    print_r($this->keywords);
    
    /*
    Array
        (
            [0] => this is too
            [1] => this is not
            [2] => keyword three
        )
    */