Search code examples
javaweb-scrapingjsoup

How to scrape currency formatted numbers in CSV file through Java


I have created a web scraper which scrapes data from a website and store it in CSV file. But, the problem is there is a column on the website which have currency formatted values which have amounts like 7,100 or 85,210. When my code executed and scrapes the data, these values separated into two columns like 7 in one column and 100 in other column. Kindly, check the attached screenshots. Code is as follows.

public class ComMarket_summary {

boolean writeCSVToConsole = true;
boolean writeCSVToFile = true;
boolean sortTheList = true;
boolean writeToConsole;
boolean writeToFile;
public static Document doc = null;
public static Elements tbodyElements = null;
public static Elements elements = null;
public static Elements tdElements = null;
public static Elements trElement2 = null;
public static String Dcomma = ",";
public static String line = "";
public static ArrayList<Elements> sampleList = new ArrayList<Elements>();

public static void createConnection() throws IOException {
    System.setProperty("http.proxyHost", "191.1.1.202");
    System.setProperty("http.proxyPort", "8080");
    String tempUrl = "http://www.psx.com.pk/phps/mktSummary.php";
    doc = Jsoup.parse(new URL(tempUrl), 1000);
    System.out.println("Successfully Connected");
}

public static void parsingHTML() throws Exception {

    for (Element table : doc.select("table.marketData")) {
        Elements tables = doc.select("table.marketData");
        table = tables.get(2);
        File fold = new File("C:\\market_smry.csv");
        fold.delete();
        File fnew = new File("C:\\market_smry.csv");
        for (Element trElement : table.getElementsByTag("tr")) {

            trElement2 = trElement.getElementsByTag("tr");
            tdElements = trElement.getElementsByTag("td");
            FileWriter sb = new FileWriter(fnew, true);

            //if (table.hasClass("marketData")) { //&&(tdElements.hasClass("tableHead")&&tdElements.hasClass("tableSubHead"))
            for (Iterator<Element> it = tdElements.iterator(); it.hasNext();) {
                if (it.hasNext()) {
                    sb.append(" , ");
                    sb.append(" \r\n ");
                }

                for (Iterator<Element> it2 = tdElements.iterator(); it.hasNext();) {
                    Element tdElement2 = it.next();
                    final String content = tdElement2.text();
                    if (it2.hasNext()) {

                        sb.append(formatData(content));
                        sb.append("   ,   ");
                        

                    }
                }

                System.out.println(sb.toString());
                sb.flush();
                sb.close();
            }

            System.out.println(sampleList.add(tdElements));

        }
    }
}
private static final SimpleDateFormat FORMATTER_MMM_d_yyyy = new SimpleDateFormat("MMM d, yyyy", Locale.US);
private static final SimpleDateFormat FORMATTER_dd_MMM_yyyy = new SimpleDateFormat("d-MMM-yy", Locale.US);

public static String formatData(String text) {
    String tmp = null;

    try {
        Date d = FORMATTER_MMM_d_yyyy.parse(text);
        tmp = FORMATTER_dd_MMM_yyyy.format(d);
    } catch (ParseException pe) {
        tmp = text;
    }

    return tmp;
}

public static void main(String[] args) throws IOException, Exception {
    createConnection();
    parsingHTML();

}

enter image description here enter image description here

Note: I am using windows 8, java version 1.8, jsoup 1.8


Solution

  • Before saving the value get rid of the comma by using String.replace

    value = value.replace (",", "");