Search code examples
java html android-studio dom jsoup

Jsoup parsing single URL and multiple bodies (maybe fragmented)


I am trying to get an HTML page using jsoup, but I am only getting the first page. Please note that the website only sometimes has multiple pages.

Background:
The link is for my transportation from work. Since my station is changing daily, I wanted to make an app that will let me know where my station will be today instead of browsing the list myself every day.

So the code takes the list URL and parses it with jsoup. It uses Pattern.compile to extract the date from the body element, but I have only figured out how to get the first page/first body. How can I get all of the pages/bodies?

package androidcodesnippets.me.cfsuman.com.mystation;

import android.content.Intent;
import android.content.SharedPreferences;
import android.os.Bundle;
import android.preference.PreferenceManager;
import android.support.v7.app.AppCompatActivity;
import android.view.Menu;
import android.view.MenuItem;
import android.webkit.WebView;
import android.widget.TextView;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.regex.Pattern;

public class MainActivity extends AppCompatActivity {
String USGS =
        "http://yit.maya-tour.co.il/yit-pass/Drop_Report.aspx? 
client_code=2660&coordinator_code=2669";
private TextView result;

@Override
protected void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.activity_main);
    result = findViewById(R.id.result);
    WebView webView = (WebView) findViewById(R.id.webview);
    webView.setInitialScale(1);
    webView.getSettings().setJavaScriptEnabled(true);
    webView.getSettings().setLoadWithOverviewMode(true);
    webView.getSettings().setUseWideViewPort(true);
    webView.setScrollBarStyle(WebView.SCROLLBARS_OUTSIDE_OVERLAY);
    webView.setScrollbarFadingEnabled(false);
    webView.loadUrl(USGS);
    getWebsite();


}


String title;
**strong text**

private void getWebsite() {
    new Thread(new Runnable() {
        @Override
        public void run() {
            SharedPreferences sharedPrefs = PreferenceManager.getDefaultSharedPreferences(MainActivity.this);
            String category = sharedPrefs.getString(
                    getString(R.string.settings_category_key),
                    getString(R.string.settings_category_default)
            );
            final StringBuilder builder = new StringBuilder();
            try {
                Document doc = Jsoup.connect(USGS).get();
                title = doc.body().text();

                String pizur = getSentence(title, category);
                builder.append(pizur).append("\n");


            } catch (IOException e) {
                builder.append("Error : ").append(e.getMessage()).append("\n");
            }

            runOnUiThread(new Runnable() {
                @Override
                public void run() {
                    result.setText(builder.toString());
                }
            });
        }
    }).start();
}

private final Pattern END_OF_SENTENCE = Pattern.compile("16:00|20:50|17:05|08:50|17:00");

public String getSentence(String text, String word) {
    final String lcword = word.toLowerCase();
    String pizur = getString(R.string.pizur_none);
    for (String sentence : END_OF_SENTENCE.split(text)) {
        if (sentence.toLowerCase().contains(lcword)) {
            return sentence;
        }
    }
    return pizur;
}

@Override
public boolean onCreateOptionsMenu(Menu menu) {
    getMenuInflater().inflate(R.menu.main, menu);
    return true;
}

@Override
public boolean onOptionsItemSelected(MenuItem item) {
    int id = item.getItemId();
    if (id == R.id.action_settings) {
        Intent settingsIntent = new Intent(this, SettingsActivity.class);
        startActivity(settingsIntent);
        return true;
    }
    return super.onOptionsItemSelected(item);
}

}

Solution

  • You're completely missing the idea of Jsoup. You use it only to download HTML and parse with regexps. You can use it to select specific parts of HTML without regular expressions. I guess your getSentence function should look like this:

    /**
     * Looks through the data rows of the {@code PassListView1} table and returns the text of
     * the third cell of the first row whose text contains {@code word}, compared
     * case-insensitively.
     *
     * <p>This is an instance method because it calls {@code getString(...)}, which is not
     * available from a static context.
     *
     * @param doc  parsed page to search
     * @param word word to look for in each row
     * @return text of the third {@code <td>} of the first matching row, or the
     *     {@code pizur_none} string resource when no suitable row is found
     */
    public String getSentence(Document doc, String word) {
        final String lcword = word.toLowerCase();
        Elements tableRows = doc.select("table#PassListView1 tr"); // all rows of this table

        // Start at index 1 to skip the header row. Unlike remove(0), this is also safe when
        // the selector matched nothing (for example, the table is missing from the page).
        for (int i = 1; i < tableRows.size(); i++) {
            Element row = tableRows.get(i);
            if (!row.text().toLowerCase().contains(lcword)) {
                continue;
            }

            Elements tds = row.select("td"); // all cells of the matching row
            for (Element td : tds) { // just displaying contents of every cell
                Log.d("TAG", td.text());
            }

            // A matching row with fewer than three cells cannot supply the value; keep looking
            // instead of failing with an index error.
            if (tds.size() > 2) {
                return tds.get(2).text(); // change number here to get what you want
            }
        }
        return getString(R.string.pizur_none); // no matching row found
    }
    

    imports:

    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    

    I'm not sure if that's what you wanted to achieve, but it should give you the general idea.

    EDIT after taking into account advice from the comments:

    I didn't know exactly what you meant, because when I visited this URL there was only one page of results :) The links to pages 2, 3, 4... POST specific parameters to the server. You can observe them in your browser's debugger (network tab). Here's a function that first sends a GET request to read the current values of the form's inputs and then POSTs a request with the page number:

    /**
     * Downloads one page of the paged result list.
     *
     * <p>Sends a GET request first to read the current values of the inputs inside
     * {@code #form1}, then POSTs those values back together with the event target and event
     * argument that select the requested page.
     *
     * @param pageNumber number of the page to request; sent as {@code "Page$" + pageNumber}
     * @return the parsed document returned by the POST request
     * @throws IOException if either request fails
     */
    private Document downloadPage(int pageNumber) throws IOException {
        Document d = Jsoup.connect(USGS).get(); // first request to get <form> and its hidden values
        Elements inputs = d.select("#form1 input");
        Connection connectionWithData = Jsoup.connect(USGS);
        for (Element input : inputs) {
            String name = input.attr("name");
            // Inputs without a name are never submitted by a browser, and jsoup rejects an
            // empty data key, so leave them out.
            if (name.isEmpty()) {
                continue;
            }
            // skipping these two params as we will fill them below
            if (name.equals("__EVENTTARGET") || name.equals("__EVENTARGUMENT")) {
                continue;
            }
            connectionWithData.data(name, input.attr("value"));
        }
        return connectionWithData
            .data("__EVENTTARGET", "PassListView1")
            .data("__EVENTARGUMENT", "Page$" + pageNumber)
            .post();
    }
    

    You can use it like this:

    // Requests results page 5; downloadPage is declared to throw IOException.
    Document doc = downloadPage(5);