Having trouble reading a csv file with embedded commas in some values

I am using clean dataset from this kaggle, for a school project and i have to write code in c++ that sorts the data and allows me to run statistical analysis on the data.

https://www.kaggle.com/datasets/elvinrustam/ebay-laptops-and-netbooks-sales/data

I've done a lot of trial and error but i cant seem to figure out how to ignore the commas and random quotes within the data using c++. I try storing the the data from each heading to a vector so that i can do operations on the data after but when i print the data i see that there is other data getting mixed up in the data sets and randoms commas, brackets, long pieces of text, etc.

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>
#include <set>

std::vector<std::string> parseCSVLine(const std::string& line) {
    std::vector<std::string> result;
    std::string cell;
    bool inQuotes = false;
    bool inBulletPoint = false; // New flag to track when we're within a bullet point
    for (auto it = line.begin(); it != line.end(); ++it) {
        const char nextChar = *it;

        // Check for bullet points
        if (!inQuotes && *it == '•') {
            inBulletPoint = true; // We're now inside a bullet point
            cell += nextChar; // Add the bullet point character to the cell
            continue;
        }

        // If we're in a bullet point, check for the end of the line or a comma (end of cell)
        if (inBulletPoint && (nextChar == ',' || it == line.end() - 1)) {
            inBulletPoint = false; // Exiting bullet point mode
            if (nextChar != ',') {
                cell += nextChar; // Ensure last character is included if not a comma
            }
            result.push_back(cell);
            cell.clear();
            continue;
        }
        else if (inBulletPoint) {
            // Simply add the character to the cell without interpreting it
            cell += nextChar;
            continue;
        }

        // Handle quotes (outside of bullet points)
        if (nextChar == '"') {
            if (inQuotes && (it + 1 != line.end()) && (*(it + 1) == '"')) {
                cell += nextChar; // Add a single quote to the cell value
                ++it; // Skip the next quote
            }
            else {
                inQuotes = !inQuotes;
            }
        }
        else if (nextChar == ',' && !inQuotes) {
            result.push_back(cell);
            cell.clear();
        }
        else {
            cell += nextChar;
        }
    }
    // Only check the last character if the line is not empty
    if (!cell.empty() || (!line.empty() && line.back() == ',')) {
        result.push_back(cell);
    }


    return result;
}

int main() {
    std::string filePath = "insert file path here";
    std::ifstream file(filePath);
    if (!file.is_open()) {
        std::cerr << "Failed to open file: " << filePath << std::endl;
        return 1;
    }

    std::string line;
    std::vector<std::string> headers;
    std::vector<std::vector<std::string>> columnData;

    if (getline(file, line)) {
        headers = parseCSVLine(line);
        columnData.resize(headers.size());
    }

    while (getline(file, line)) {
        auto data = parseCSVLine(line);
        for (size_t i = 0; i < data.size() && i < columnData.size(); ++i) {
            columnData[i].push_back(data[i]);
        }
    }

    file.close();

    //// Example output: Printing unique values for each heading for verification
    //for (size_t i = 0; i < headers.size(); ++i) {
    //    std::set<std::string> uniqueValues(columnData[i].begin(), columnData[i].end());
    //    std::cout << "Heading: " << headers[i] << " - Unique Values: " << uniqueValues.size() << std::endl;
    //    for (const auto& value : uniqueValues) {
    //        std::cout << value << std::endl;
    //    }
    //    std::cout << std::endl;
    //}


    // Make sure to define and fill 'columnData' and 'headers' as per your CSV parsing logic before this snippet

    // Here, the index is set to 2 since vector indices are 0-based and we want the third column (heading 3)
    size_t index = 2;

    // Check if the index is within the bounds of the 'columnData' vector
    if (index < columnData.size()) {
        std::cout << "Values under Heading 3 (" << headers[index] << "):" << std::endl;

        // Iterate over the vector at the given index and print each value
        for (const auto& value : columnData[index]) {
            std::cout << value << std::endl;
        }
    }
    else {
        std::cerr << "Index out of range. The columnData does not have a heading 3." << std::endl;
    }

    return 0;
}

Solution

To simplify the answer assumes.

There are no escape characters.
That the code only uses double quotes (")
That double quotes are the first character and last character in a cell.
=> open is first character on the line or first character after comma.
=> close is last character on the line or immediately followed by comma.
Not all cells use quotes but all cells that have commas use quotes.

A quick scan of the OA input data seem to indicate thses factors hold. But if the data changes you will need to revervirify.

Note: Scanning a generic CSV these conditions may not hold and you should use a real parser. But a well-behaved CSV file with these conditions this should suffice.

I would split this into two problems.

Scanning a Line
Scanning a Value (Cell).

So I would do it like this:

Note: Not code reviewed, so may have some mistakes. Please validate with tests.

#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>

struct Value
{
    std::string     value;
    friend std::ostream& operator<<(std::ostream& s, Value const& data)
    {
        s << data.value;
        return s;
    }
    friend std::istream& operator>>(std::istream& s, Value& data)
    {
        // assumes cell ends with a ',' or eof
        // Will remove the trailing ',' from the stream but not include it in the data
        // If cell is quoted (") then removes quotes does not add them to the data.
        // Assumes quotes are first and last character in cell.

        char peek = s.peek();
        if (peek != '"') {
            std::getline(s, data.value, ',');   // drops trailing comma
        }
        else {
            char get1 = s.get();                // gets leading quote
            std::getline(s, data.value, '"');  // drops tailing quote
            char get2 = s.get();                // gets trailing comma
        }
        return s;
    }
};
struct Line
{
    std::vector<Value>        values;
    friend std::ostream& operator<<(std::ostream& s, Line const& data)
    {
        for (auto const& v: data.values) {
            s << v << ",";
        }
        return s;
    }
    friend std::istream& operator>>(std::istream& s, Line& data)
    {
        // assumes we can get a line of data with std::getline
        // All the data is in chunks of Value.

        std::string line;
        if (std::getline(s, line)) {

            std::stringstream   lineStream(std::move(line));
            std::vector<Value>  tmp{std::istream_iterator<Value>{lineStream}, std::istream_iterator<Value>{}};
            data.values = std::move(tmp);
        }
        return s;
    }
};

int main()
{
    std::ifstream   csv("EbayPcLaptopsAndNetbooksUnclean.csv");

    Line    line;
    int     count = 0;
    int     lineSizeCount[1000] = {0};
    while(csv >> line) {
        ++count;
        ++lineSizeCount[line.values.size()];
    }
    std::cout << "Lines: " << count << "\n";
    for (int loop = 0; loop < 1000; ++loop) {
        if (lineSizeCount[loop] != 0) {
            std::cout << "  Size: " << loop << " => " << lineSizeCount[loop] << "\n";
        }
    }
}