Dataframe
Overview
The Dataframe class is a collection of named Serie objects that can have different element types.
It provides a convenient way to manage related data series with typed access, similar to a table or spreadsheet in which each column can have its own data type.
Class Definition
namespace df {
/**
* @brief A Dataframe is a collection of series.
*
* Each serie is registered under a name and may have its own element type.
* Typed access goes through the templated get(); the element type stored
* under a name can be queried with type() / type_name().
*/
class Dataframe {
public:
/**
* @brief Construct a Dataframe (compiler-generated default constructor).
*/
Dataframe() = default;
/**
* @brief Destroy the Dataframe (compiler-generated destructor).
*/
~Dataframe() = default;
/**
* @brief Add a serie to the Dataframe with the given name
* @tparam T element type of the serie
* @param name name under which the serie is registered
* @param serie the serie to add
* @throws std::runtime_error if a serie with this name already exists
*/
template <typename T>
void add(const std::string &name, const Serie<T> &serie);
/**
* @brief Add a serie to the Dataframe with the given name
* @tparam T element type of the array
* @param name name under which the data is registered
* @param array the values to add (ArrayType<T> is declared elsewhere)
* @note NOTE(review): presumably throws std::runtime_error on a duplicate
* name like the Serie<T> overload -- confirm against the implementation.
*/
template <typename T>
void add(const std::string &name, const ArrayType<T> &array);
/**
* @brief Remove a serie from the Dataframe
* @param name name of the serie to remove
* @throws std::runtime_error if the serie doesn't exist
*/
void remove(const std::string &name);
/**
* @brief Get a serie by name and type
* @tparam T element type the caller expects the serie to have
* @param name name of the serie to look up
* @return const reference to the requested Serie<T>
* @throws std::runtime_error if the serie doesn't exist or if there's a
* type mismatch
*/
template <typename T> const Serie<T> &get(const std::string &name) const;
/**
* @brief Get the type info for a serie
* @param name name of the serie to inspect
* @return std::type_index associated with the serie
* @throws std::runtime_error if the serie doesn't exist
*/
std::type_index type(const std::string &name) const;
/**
* @brief Get the type name for a serie
* @param name name of the serie to inspect
* @return the type name as a String (String is declared elsewhere in the
* project; NOTE(review): presumably an alias of std::string -- confirm)
* @throws std::runtime_error if the serie doesn't exist
*/
String type_name(const std::string &name) const;
/**
* @brief Check if a serie exists with the given name
* @param name name to look for
* @return true if a serie is registered under this name
*/
bool has(const std::string &name) const;
/**
* @brief Get the number of series in the Dataframe
* @return number of series currently held
*/
size_t size() const;
/**
* @brief Get all serie names in the Dataframe
* @return the names, returned by value
*/
std::vector<std::string> names() const;
/**
* @brief Clear all series from the Dataframe
*/
void clear();
};
} // namespace df
Usage
The Dataframe class provides methods to add, access, and manage multiple Series with different data types. Each Serie is identified by a unique name, and you must specify the correct type when accessing a Serie.
#include <dataframe/Dataframe.h>
#include <dataframe/Serie.h>
#include <iostream>
#include <string>
int main() {
// Create a Dataframe
df::Dataframe data;
// Add Series with different types
data.add("ages", df::Serie<int>{25, 32, 41, 28, 35});
data.add("names", df::Serie<std::string>{"Alice", "Bob", "Charlie", "Diana", "Edward"});
data.add("heights", df::Serie<double>{165.5, 180.2, 175.0, 162.8, 183.5});
data.add("weights", df::Serie<double>{60.2, 78.5, 82.1, 58.7, 85.3});
// Get all Serie names
std::cout << "Series in Dataframe: ";
for (const auto& name : data.names()) {
std::cout << name << " ";
}
std::cout << std::endl;
// Check if a Serie exists
if (data.has("ages")) {
std::cout << "Ages Serie exists in the Dataframe" << std::endl;
}
// Get Serie type
std::cout << "Type of 'heights' Serie: " << data.type_name("heights") << std::endl;
// Access Series with the correct type
const auto& names = data.get<std::string>("names");
const auto& ages = data.get<int>("ages");
// Print some values
std::cout << "Names: " << names << std::endl;
std::cout << "Ages: " << ages << std::endl;
// Remove a Serie
data.remove("weights");
std::cout << "After removing 'weights', number of Series: " << data.size() << std::endl;
// Clear the Dataframe
data.clear();
std::cout << "After clearing, number of Series: " << data.size() << std::endl;
return 0;
}
Working with Series in a Dataframe
Dataframes are especially useful when working with related data that needs to be processed together. You can combine data from multiple Series to perform calculations and analysis.
#include <dataframe/Dataframe.h>
#include <dataframe/Serie.h>
#include <dataframe/map.h>
#include <dataframe/filter.h>
#include <dataframe/zip.h>
#include <iostream>
#include <string>
// Example: combine several series of a Dataframe to derive new data
// (per-student averages), select rows by a computed value, and locate the
// row holding the maximum of one column.
int main() {
    // Create a Dataframe with student data
    df::Dataframe students;

    // Add Series for different attributes
    students.add("names", df::Serie<std::string>{"Alice", "Bob", "Charlie", "Diana", "Edward"});
    students.add("math_scores", df::Serie<int>{85, 72, 91, 65, 78});
    students.add("science_scores", df::Serie<int>{92, 68, 83, 77, 85});
    students.add("english_scores", df::Serie<int>{78, 82, 75, 89, 70});

    // Calculate average scores and add as a new Serie.
    // get() returns a const reference, so bind to it instead of copying each
    // Serie. The references are only used before the add() call below.
    const auto& math = students.get<int>("math_scores");
    const auto& science = students.get<int>("science_scores");
    const auto& english = students.get<int>("english_scores");
    auto avg_scores = df::zip(math, science, english).map([](const auto& scores, size_t) {
        const auto& [m, s, e] = scores;
        return (m + s + e) / 3.0;  // 3.0 forces floating-point division
    });
    students.add("avg_scores", avg_scores);

    // Get names of students with average score above 80
    auto good_students = df::zip(students.get<std::string>("names"), students.get<double>("avg_scores"))
        .filter([](const auto& tuple, size_t) {
            // Bind by reference: avoids copying the name string per element.
            const auto& [_, avg] = tuple;
            return avg > 80.0;
        })
        .map([](const auto& tuple, size_t) {
            const auto& [name, _] = tuple;
            return name;  // returned by value (the lambda's return type decays)
        });
    std::cout << "Students with average score above 80: " << good_students << std::endl;

    // Find the student with the highest math score.
    // Starting from 0 is sufficient here because the scores are non-negative.
    int highest_math = 0;
    size_t top_student_idx = 0;
    students.get<int>("math_scores").forEach([&](int score, size_t idx) {
        if (score > highest_math) {
            highest_math = score;
            top_student_idx = idx;
        }
    });
    std::cout << "Student with highest math score: "
              << students.get<std::string>("names")[top_student_idx]
              << " (" << highest_math << ")" << std::endl;

    return 0;
}
Creating Dataframes from Data Sources
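Dataframes can be populated from external data sources such as CSV files or JSON data using the library's I/O functions. The example below reads a CSV file, derives a new Serie from two existing columns, and writes the result back to CSV.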
#include <dataframe/Dataframe.h>
#include <dataframe/Serie.h>
#include <dataframe/io/csv.h>
#include <iostream>
#include <string>
// Example: load a Dataframe from a CSV file, inspect it, derive a new column
// from two existing ones, and write the result back to disk. Any exception
// raised along the way is reported on stderr and turned into exit code 1.
int main() {
    try {
        // Load data from a CSV file
        df::Dataframe data = df::io::read_csv("data.csv");

        // Print information about the loaded data
        std::cout << "Loaded " << data.size() << " series from CSV." << std::endl;
        std::cout << "Series names: ";
        for (const auto& name : data.names()) {
            std::cout << name << " ";
        }
        std::cout << std::endl;

        // Access specific columns by name
        if (data.has("temperature")) {
            std::cout << "Temperature data: " << data.get<double>("temperature") << std::endl;
        }
        if (data.has("date")) {
            // NOTE(review): assumes the column holds at least one row -- confirm
            // how Serie::operator[] behaves on an empty serie.
            std::cout << "First date: " << data.get<std::string>("date")[0] << std::endl;
        }

        // Process the data
        if (data.has("temperature") && data.has("humidity")) {
            // Calculate a heat index (simplified).
            // get() returns a const reference, so bind to it instead of
            // copying each Serie. The references are only used before the
            // add() call below.
            const auto& temps = data.get<double>("temperature");
            const auto& humidity = data.get<double>("humidity");
            auto heat_index = df::zip(temps, humidity).map([](const auto& tuple, size_t) {
                const auto& [t, h] = tuple;
                return t + 0.05 * h; // Simplified formula
            });

            // Add the new Serie to the Dataframe
            data.add("heat_index", heat_index);
            std::cout << "Added heat index calculations." << std::endl;
        }

        // Save the modified data back to CSV
        df::io::write_csv(data, "processed_data.csv");
        std::cout << "Saved processed data to CSV." << std::endl;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}
Implementation Notes
- Each Serie in a Dataframe is stored with type information, allowing type-safe access.
- Series in a Dataframe can have different element types.
- Accessing a Serie with the wrong type will throw a runtime_error.
- Adding a Serie with a name that already exists will throw a runtime_error.
- Removing a Serie that doesn't exist will throw a runtime_error.
- Dataframe does not modify the Series it contains; it only stores references to them.
- The Dataframe class uses type erasure to store Series of different types.