I wrote a minimal CSV parser for my machine learning toy project.
// Minimal CSV parser for small ML datasets.
//
// The first line of the stream is a header of comma-separated column names.
// Each subsequent line is split on ',' and every cell is converted to one of
// int32_t / float / std::string. The type of each column is inferred from the
// first cell seen in that column and reused for the rest of the column.
class CSV {
public:
    using data_type = std::variant<int32_t, float, std::string>;
    using row_type = std::vector<data_type>;

private:
    std::vector<row_type> rows;
    // NOTE(review): the old, never-populated `columns` member was removed —
    // it duplicated nothing and only suggested the data was stored twice.
    std::vector<std::string> column_names;
    // Inferred type per column: -1 = unknown, 0 = int32_t, 1 = float, 2 = string.
    std::vector<int> column_data_types;

public:
    // Parses the entire stream up front; the stream must begin with a header row.
    explicit CSV(std::istream& is) {
        ParseCSV(is);
    }

    // Read-only access to the parsed rows (new, backward-compatible accessor;
    // previously the parsed data was private and unreachable).
    const std::vector<row_type>& get_rows() const {
        return rows;
    }

    // Parses str as a 32-bit integer. Returns 0 when str is not a valid
    // integer (same fallback behavior the original relied on).
    int32_t to_int(const std::string& str) {
        int32_t result = 0;
        std::from_chars(str.data(), str.data() + str.size(), result);
        return result;
    }

    // Parses str as a float. Returns 0.0f when str is not a valid float.
    float to_float(const std::string& str) {
        float result = 0;
        std::from_chars(str.data(), str.data() + str.size(), result);
        return result;
    }

    // Converts one raw cell to the column's data type, inferring the type
    // from the first cell seen in that column.
    data_type to_data(const std::string& str, std::size_t col_index) {
        // Guard against data rows wider than the header (was out-of-bounds UB).
        if (col_index >= column_data_types.size()) {
            column_data_types.resize(col_index + 1, -1);
        }
        switch (column_data_types[col_index]) {
        case 0:
            return to_int(str);
        case 1:
            return to_float(str);
        case 2:
            return str;
        default: {  // type still unknown: infer it from this cell
            float result = 0.0f;
            auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.size(), result);
            // Not a number, out of range, or trailing junk after the number
            // (e.g. "12 Main St" must not become the integer 12): string column.
            if (ec != std::errc() || ptr != str.data() + str.size()) {
                column_data_types[col_index] = 2;
                return str;
            }
            if (result == std::floor(result)) {  // whole number: treat as int
                column_data_types[col_index] = 0;
                return to_int(str);
            }
            column_data_types[col_index] = 1;
            return result;
        }
        }
    }

    // Reads the header line and fills column_names / column_data_types.
    // len is decremented by the number of bytes consumed. (The original
    // forgot the '\n' itself — an off-by-one that made ParseCSV read past
    // EOF — and set failbit on headers longer than its 1024-byte buffer.)
    void ParseHeader(std::istream& is, std::streamsize& len) {
        std::string line;
        if (std::getline(is, line)) {
            // line.size() characters plus the consumed '\n' (absent only at EOF).
            len -= static_cast<std::streamsize>(line.size()) + (is.eof() ? 0 : 1);
            if (!line.empty() && line.back() == '\r') {  // tolerate CRLF files
                line.pop_back();
            }
            std::size_t start = 0;
            while (true) {
                const std::size_t comma = line.find(',', start);
                column_names.push_back(line.substr(start, comma - start));
                if (comma == std::string::npos) {
                    break;
                }
                start = comma + 1;
            }
        }
        column_data_types.resize(column_names.size(), -1);
    }

    // Parses the whole stream: header first, then one data row per line.
    void ParseCSV(std::istream& is) {
        // Kept for ParseHeader's byte-accounting contract; the row loop below
        // no longer needs a byte count (std::getline stops at EOF on its own,
        // which also fixes the lost final row when the file has no trailing '\n').
        is.seekg(0, std::ios::end);
        std::streamsize len = is.tellg();
        is.seekg(0, std::ios::beg);
        const auto t1 = std::chrono::steady_clock::now();
        ParseHeader(is, len);
        std::string line;
        while (std::getline(is, line)) {
            if (!line.empty() && line.back() == '\r') {  // tolerate CRLF files
                line.pop_back();
            }
            if (line.empty()) {  // skip blank lines (e.g. a final empty line)
                continue;
            }
            row_type row;
            row.reserve(column_names.size());
            std::size_t start = 0;
            std::size_t col_index = 0;
            while (true) {
                const std::size_t comma = line.find(',', start);
                row.push_back(to_data(line.substr(start, comma - start), col_index));
                if (comma == std::string::npos) {
                    break;
                }
                start = comma + 1;
                ++col_index;
            }
            rows.push_back(std::move(row));
        }
        const auto t2 = std::chrono::steady_clock::now();
        const auto dt = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
        std::cout << rows.size() << " element read in " << dt.count() << " us\n";
    }
};
Reading 20,628 lines from https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz gives, on my machine:
20628 element read in 89734 us
Feel free to comment on anything!
You have both a `rows` member and a `columns` member. Does the class hold the same data twice?