I am trying to parse the HTML from https://www.thesaurus.com/ to get the synonyms of a word.
I identified the class of the u1 tag that contains the list of "li" items, as highlighted in the image above. I want to parse the list inside that u1 tag, so I used its class in doc.select, as in the code below:
// Download the thesaurus.com page for the word "hell" and parse it into a jsoup Document.
String url = "https://www.thesaurus.com/browse/hell";
Document doc = Jsoup.connect(url).get();
// Select the synonym list container.
// NOTE(review): "u1" (letter u, digit one) is probably meant to be "ul" -- confirm against the page markup.
// NOTE(review): in CSS selector syntax a space is the descendant combinator, so " et6tpn80" is read as a
// descendant *tag* named et6tpn80 rather than a second class; two classes on one element would be
// written ".css-17d6qyx-WordGridLayoutBox.et6tpn80".
Elements data = doc.select("u1.css-17d6qyx-WordGridLayoutBox et6tpn80");
// Number of elements matched by the selector above (the log output shows this is 0).
int size = data.size();
// Debug output: the whole document, the matched elements, and the match count.
Log.d(LOG_TAG, "doc: "+doc);
Log.d(LOG_TAG, "data: "+data);
Log.d(LOG_TAG, "size"+size);
// Runs once per matched container element; with size == 0 the body never executes.
// NOTE(review): the loop bound is the number of matched containers, not the number of <a> links -- verify this is intended.
for (int i = 0; i < size; i++) {
// Within the matches, find the item spans, then their <a> children, take the i-th link and read its href.
// NOTE(review): the same space-in-selector issue applies to " etbu2a32" here.
String synonym = data.select("span.css-1y6i96q-WordGridItemBox etbu2a32")
.select("a")
.eq(i)
.attr("href");
// Store the extracted href and log it.
parseItems.add(new ParseItem(synonym));
Log.d("items", " synonym: " + synonym);
// (the closing brace of the for loop is not shown in this snippet)
Below is the result I got:
2020-07-08 14:22:52.561 16508-16576/com.example.scraping_synonyms D/MainActivity: doc: <!doctype html>
<html lang="en" prefix="og: http://ogp.me/ns#">
<head>
<meta charSet="utf-8">
<title>Hell Synonyms, Hell Antonyms | Thesaurus.com</title>
<meta name="description" content="Synonyms for hell at Thesaurus.com with free online thesaurus, antonyms, and definitions. Find descriptive alternatives for hell. ">
<meta property="og:title" content="Synonyms of hell | Thesaurus.com">
<meta property="og:description" content="Synonyms for hell from Thesaurus.com, the world’s leading online source for synonyms, antonyms, and more.">
<meta property="og:image" content="https://www.thesaurus.com/assets/thesaurus_social_logo-4b42f0643b92eaf85fc0e4e78aa84a8d.png">
<meta property="og:site_name" content="www.thesaurus.com">
<meta property="twitter:card" content="summary">
<meta property="twitter:site" content="@Dictionarycom">
<meta property="fb:app_id" content="118269238218175">
<meta property="fb:admins" content="100000304287730,109125464873">
<meta name="msvalidate.01" content="DF5542D7723770377E9ABFF59AC1DC97">
<link rel="icon" type="image/png" href="https://www.thesaurus.com/assets/favicon-54545f5303fccb956af394ac10f1655d.png">
<link rel="canonical" href="https://www.thesaurus.com/browse/hell">
<link rel="preload" href="https://www.thesaurus.com/assets/dictionary-font-bdb88a42d7d4dd71d0a4be54bda1f59a.woff" as="font" type="font/woff" crossorigin="anonymous">
<link rel="preconnect" href="//ads.pubmatic.com" crossorigin>
<link rel="preconnect" href="//securepubads.g.doubleclick.net" crossorigin>
<link rel="preconnect" href="//cm.g.doubleclick.net" crossorigin>
<link rel="preconnect" href="//pagead2.googlesyndication.com" crossorigin>
<link rel="preconnect" href="//ib.adnxs.com" crossorigin>
<link rel="preconnect" href="//ce.lijit.com" crossorigin>
<link rel="preconnect" href="//ap.lijit.com" crossorigin>
<link rel="preconnect" href="//us-u.openx.net" crossorigin>
<link rel="dns-prefetch" href="//fastlane.rubiconproject.com">
<link rel="dns-prefetch" href="//pixel.rubiconproject.com">
<link rel="dns-prefetch" href="//token.rubiconproject.com">
<link rel="dns-prefetch" href="//dsum-sec.casalemedia.com">
<link rel="dns-prefetch" href="//as-sec.casalemedia.com">
<link rel="dns-prefetch" href="//gum.criteo.com">
<link rel="dns-prefetch" href="//static.criteo.net">
<link rel="dns-prefetch" href="//bidder.criteo.com">
<link rel="dns-prefetch" href="//adserver.adtech.advertising.com">
<link rel="dns-prefetch" href="//cdata.carambo.la">
<script>
!function(n,e){var t,o,i,c=[],f={passive:!0,capture:!0},r=new Date,a="pointerup",u="pointercancel";function p(n,c){t||(t=c,o=n,i=new Date,w(e),s())}function s(){o>=0&&o<i-r&&(c.forEach(function(n){n(o,t)}),c=[])}function l(t){if(t.cancelable){var o=(t.timeStamp>1e12?new Date:performance.now())-t.timeStamp;"pointerdown"==t.type?function(t,o){function i(){p(t,o),r()}function c(){r()}function r(){e(a,i,f),e(u,c,f)}n(a,i,f),n(u,c,f)}(o,t):p(o,t)}}function w(n){["click","mousedown","keydown","touchstart","pointerdown"].forEach(function(e){n(e,l,f)})}w(n),self.perfMetrics=self.perfMetrics||{},self.perfMetrics.onFirstInputDelay=function(n){c.push(n),s()}}(addEventListener,removeEventListener);
perfMetrics.onFirstInputDelay(function(delay, evt) {
window.dataLayer = window.dataLayer || [];
dataLayer.push({
event: 'FID',
action: evt.type,
value: delay
});
});
</script>
<script defer src="https://www.thesaurus.com/assets/client-4e944b31.js"></script>
<script defer src="https://www.thesaurus.com/assets/react-456bf542.js"></script>
<script defer src="https://www.thesaurus.com/assets/vendor~253ae210-5a6e2153.js"></script>
<script defer src="https://www.thesaurus.com/assets/vendor~7274e1de-00d240c3.js"></script>
<script defer src="https://www.thesaurus.com/assets/vendor~7d359b94-5bc2d791.js"></script>
<script defer src="https://www.thesaurus.com/assets/vendor~b9cf3951-fea1f433.js"></script>
<!-- Ad integration with callbac
2020-07-08 14:22:52.562 16508-16576/com.example.scraping_synonyms D/MainActivity: data:
2020-07-08 14:22:52.562 16508-16576/com.example.scraping_synonyms D/MainActivity: size0
I am new to data scraping with jsoup. Could anyone explain what I did wrong?
If you know the CSS class of the elements you want to extract, this is easily done with getElementsByClass:
// Gather every element in the document that carries the "etbu2a31" CSS class.
Elements data = doc.getElementsByClass("etbu2a31");
// Walk the matches by position, printing each one's href attribute followed by its text.
for (int idx = 0; idx < data.size(); idx++) {
    Element match = data.get(idx);
    String href = match.attr("href");
    System.out.println(" synonym URL: " + href);
    String label = match.text();
    System.out.println(" synonym text: " + label);
}