I am using clean dataset from this kaggle, for a school project and i have to write code in c++ that sorts the data and allows me to run statistical analysis on the data.
https://www.kaggle.com/datasets/elvinrustam/ebay-laptops-and-netbooks-sales/data
I've done a lot of trial and error but i cant seem to figure out how to ignore the commas and random quotes within the data using c++. I try storing the the data from each heading to a vector so that i can do operations on the data after but when i print the data i see that there is other data getting mixed up in the data sets and randoms commas, brackets, long pieces of text, etc.
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <string>
#include <set>
std::vector<std::string> parseCSVLine(const std::string& line) {
std::vector<std::string> result;
std::string cell;
bool inQuotes = false;
bool inBulletPoint = false; // New flag to track when we're within a bullet point
for (auto it = line.begin(); it != line.end(); ++it) {
const char nextChar = *it;
// Check for bullet points
if (!inQuotes && *it == '•') {
inBulletPoint = true; // We're now inside a bullet point
cell += nextChar; // Add the bullet point character to the cell
continue;
}
// If we're in a bullet point, check for the end of the line or a comma (end of cell)
if (inBulletPoint && (nextChar == ',' || it == line.end() - 1)) {
inBulletPoint = false; // Exiting bullet point mode
if (nextChar != ',') {
cell += nextChar; // Ensure last character is included if not a comma
}
result.push_back(cell);
cell.clear();
continue;
}
else if (inBulletPoint) {
// Simply add the character to the cell without interpreting it
cell += nextChar;
continue;
}
// Handle quotes (outside of bullet points)
if (nextChar == '"') {
if (inQuotes && (it + 1 != line.end()) && (*(it + 1) == '"')) {
cell += nextChar; // Add a single quote to the cell value
++it; // Skip the next quote
}
else {
inQuotes = !inQuotes;
}
}
else if (nextChar == ',' && !inQuotes) {
result.push_back(cell);
cell.clear();
}
else {
cell += nextChar;
}
}
// Only check the last character if the line is not empty
if (!cell.empty() || (!line.empty() && line.back() == ',')) {
result.push_back(cell);
}
return result;
}
int main() {
std::string filePath = "insert file path here";
std::ifstream file(filePath);
if (!file.is_open()) {
std::cerr << "Failed to open file: " << filePath << std::endl;
return 1;
}
std::string line;
std::vector<std::string> headers;
std::vector<std::vector<std::string>> columnData;
if (getline(file, line)) {
headers = parseCSVLine(line);
columnData.resize(headers.size());
}
while (getline(file, line)) {
auto data = parseCSVLine(line);
for (size_t i = 0; i < data.size() && i < columnData.size(); ++i) {
columnData[i].push_back(data[i]);
}
}
file.close();
//// Example output: Printing unique values for each heading for verification
//for (size_t i = 0; i < headers.size(); ++i) {
// std::set<std::string> uniqueValues(columnData[i].begin(), columnData[i].end());
// std::cout << "Heading: " << headers[i] << " - Unique Values: " << uniqueValues.size() << std::endl;
// for (const auto& value : uniqueValues) {
// std::cout << value << std::endl;
// }
// std::cout << std::endl;
//}
// Make sure to define and fill 'columnData' and 'headers' as per your CSV parsing logic before this snippet
// Here, the index is set to 2 since vector indices are 0-based and we want the third column (heading 3)
size_t index = 2;
// Check if the index is within the bounds of the 'columnData' vector
if (index < columnData.size()) {
std::cout << "Values under Heading 3 (" << headers[index] << "):" << std::endl;
// Iterate over the vector at the given index and print each value
for (const auto& value : columnData[index]) {
std::cout << value << std::endl;
}
}
else {
std::cerr << "Index out of range. The columnData does not have a heading 3." << std::endl;
}
return 0;
}
To simplify the answer assumes.
A quick scan of the OA input data seem to indicate thses factors hold. But if the data changes you will need to revervirify.
Note: Scanning a generic CSV these conditions may not hold and you should use a real parser. But a well-behaved CSV file with these conditions this should suffice.
I would split this into two problems.
So I would do it like this:
Note: Not code reviewed, so may have some mistakes. Please validate with tests.
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
struct Value
{
std::string value;
friend std::ostream& operator<<(std::ostream& s, Value const& data)
{
s << data.value;
return s;
}
friend std::istream& operator>>(std::istream& s, Value& data)
{
// assumes cell ends with a ',' or eof
// Will remove the trailing ',' from the stream but not include it in the data
// If cell is quoted (") then removes quotes does not add them to the data.
// Assumes quotes are first and last character in cell.
char peek = s.peek();
if (peek != '"') {
std::getline(s, data.value, ','); // drops trailing comma
}
else {
char get1 = s.get(); // gets leading quote
std::getline(s, data.value, '"'); // drops tailing quote
char get2 = s.get(); // gets trailing comma
}
return s;
}
};
struct Line
{
std::vector<Value> values;
friend std::ostream& operator<<(std::ostream& s, Line const& data)
{
for (auto const& v: data.values) {
s << v << ",";
}
return s;
}
friend std::istream& operator>>(std::istream& s, Line& data)
{
// assumes we can get a line of data with std::getline
// All the data is in chunks of Value.
std::string line;
if (std::getline(s, line)) {
std::stringstream lineStream(std::move(line));
std::vector<Value> tmp{std::istream_iterator<Value>{lineStream}, std::istream_iterator<Value>{}};
data.values = std::move(tmp);
}
return s;
}
};
int main()
{
std::ifstream csv("EbayPcLaptopsAndNetbooksUnclean.csv");
Line line;
int count = 0;
int lineSizeCount[1000] = {0};
while(csv >> line) {
++count;
++lineSizeCount[line.values.size()];
}
std::cout << "Lines: " << count << "\n";
for (int loop = 0; loop < 1000; ++loop) {
if (lineSizeCount[loop] != 0) {
std::cout << " Size: " << loop << " => " << lineSizeCount[loop] << "\n";
}
}
}