Back

Dataframe

Overview

The Dataframe class is a collection of named Serie objects that can have different element types. It provides a convenient way to manage related data series with typed access, similar to a table or spreadsheet where each column can have a different data type.

Class Definition

Dataframe Class Definition

namespace df {

/**
 * @brief A Dataframe is a collection of named series whose element types may
 * differ from one serie to the next.
 *
 * Each serie is registered under a unique name. The typed accessor requires
 * the caller to state the element type of the serie being requested.
 */
class Dataframe {
  public:
    Dataframe() = default;
    ~Dataframe() = default;

    /**
     * @brief Add a serie to the Dataframe with the given name
     * @tparam T element type of the serie
     * @param name name under which the serie is registered
     * @param serie the serie to add
     * @throws std::runtime_error if a serie with this name already exists
     */
    template <typename T>
    void add(const std::string &name, const Serie<T> &serie);

    /**
     * @brief Add a serie to the Dataframe with the given name, taking its
     * values from an array
     * @tparam T element type of the array
     * @param name name under which the serie is registered
     * @param array the values to add
     * @note NOTE(review): presumably throws std::runtime_error on a duplicate
     * name, like the Serie overload - confirm against the implementation.
     */
    template <typename T>
    void add(const std::string &name, const ArrayType<T> &array);

    /**
     * @brief Remove a serie from the Dataframe
     * @param name name of the serie to remove
     * @throws std::runtime_error if the serie doesn't exist
     */
    void remove(const std::string &name);

    /**
     * @brief Get a serie by name and type
     * @tparam T element type the serie is expected to hold
     * @param name name of the serie to look up
     * @return a const reference to the requested serie
     * @throws std::runtime_error if the serie doesn't exist or if there's a
     * type mismatch
     */
    template <typename T> const Serie<T> &get(const std::string &name) const;

    /**
     * @brief Get the type info for a serie
     * @param name name of the serie to look up
     * @return the type info of the serie, as a std::type_index
     * @throws std::runtime_error if the serie doesn't exist
     * @note NOTE(review): whether this identifies the element type T or
     * Serie<T> is not stated here - confirm against the implementation.
     */
    std::type_index type(const std::string &name) const;

    /**
     * @brief Get the type name for a serie
     * @param name name of the serie to look up
     * @return the type name of the serie
     * @throws std::runtime_error if the serie doesn't exist
     */
    String type_name(const std::string &name) const;

    /**
     * @brief Check if a serie exists with the given name
     * @param name name to look for
     * @return true if a serie is registered under this name
     */
    bool has(const std::string &name) const;

    /**
     * @brief Get the number of series in the Dataframe
     * @return the number of series (not the number of elements per serie)
     */
    size_t size() const;

    /**
     * @brief Get all serie names in the Dataframe
     * @return the names of every serie currently held
     */
    std::vector<std::string> names() const;

    /**
     * @brief Clear all series from the Dataframe
     */
    void clear();
};

} // namespace df

Usage

The Dataframe class provides methods to add, access, and manage multiple Series with different data types. Each Serie is identified by a unique name, and you must specify the correct type when accessing a Serie.

Basic Dataframe Usage

#include <dataframe/Dataframe.h>
#include <dataframe/Serie.h>
#include <iostream>
#include <string>

int main() {
    // Create a Dataframe
    df::Dataframe data;
    
    // Add Series with different types
    data.add("ages", df::Serie<int>{25, 32, 41, 28, 35});
    data.add("names", df::Serie<std::string>{"Alice", "Bob", "Charlie", "Diana", "Edward"});
    data.add("heights", df::Serie<double>{165.5, 180.2, 175.0, 162.8, 183.5});
    data.add("weights", df::Serie<double>{60.2, 78.5, 82.1, 58.7, 85.3});
    
    // Get all Serie names
    std::cout << "Series in Dataframe: ";
    for (const auto& name : data.names()) {
        std::cout << name << " ";
    }
    std::cout << std::endl;
    
    // Check if a Serie exists
    if (data.has("ages")) {
        std::cout << "Ages Serie exists in the Dataframe" << std::endl;
    }
    
    // Get Serie type
    std::cout << "Type of 'heights' Serie: " << data.type_name("heights") << std::endl;
    
    // Access Series with the correct type
    const auto& names = data.get<std::string>("names");
    const auto& ages = data.get<int>("ages");
    
    // Print some values
    std::cout << "Names: " << names << std::endl;
    std::cout << "Ages: " << ages << std::endl;
    
    // Remove a Serie
    data.remove("weights");
    std::cout << "After removing 'weights', number of Series: " << data.size() << std::endl;
    
    // Clear the Dataframe
    data.clear();
    std::cout << "After clearing, number of Series: " << data.size() << std::endl;
    
    return 0;
}

Working with Series in a Dataframe

Dataframes are especially useful when working with related data that needs to be processed together. You can combine data from multiple Series to perform calculations and analysis.

Advanced Dataframe Usage

#include <dataframe/Dataframe.h>
#include <dataframe/Serie.h>
#include <dataframe/map.h>
#include <dataframe/filter.h>
#include <dataframe/zip.h>
#include <iostream>
#include <string>

int main() {
    // Create a Dataframe with student data
    df::Dataframe students;
    
    // Add Series for different attributes
    students.add("names", df::Serie<std::string>{"Alice", "Bob", "Charlie", "Diana", "Edward"});
    students.add("math_scores", df::Serie<int>{85, 72, 91, 65, 78});
    students.add("science_scores", df::Serie<int>{92, 68, 83, 77, 85});
    students.add("english_scores", df::Serie<int>{78, 82, 75, 89, 70});
    
    // Calculate average scores and add as a new Serie.
    // get() returns a const reference, so bind to it instead of copying each
    // Serie. The references are only used before the add() call below.
    const auto& math = students.get<int>("math_scores");
    const auto& science = students.get<int>("science_scores");
    const auto& english = students.get<int>("english_scores");
    
    auto avg_scores = df::zip(math, science, english).map([](const auto& scores, size_t) {
        const auto& [m, s, e] = scores;
        return (m + s + e) / 3.0;  // divide by 3.0 to get a double average
    });
    
    students.add("avg_scores", avg_scores);
    
    // Get names of students with average score above 80
    auto good_students = df::zip(students.get<std::string>("names"), students.get<double>("avg_scores"))
        .filter([](const auto& tuple, size_t) {
            // Bind by reference: avoids copying the name string per element
            const auto& [_, avg] = tuple;
            return avg > 80.0;
        })
        .map([](const auto& tuple, size_t) {
            const auto& [name, _] = tuple;
            return name;  // returned by value: one copy, into the result
        });
    
    std::cout << "Students with average score above 80: " << good_students << std::endl;
    
    // Find the student with the highest math score.
    // Starting from 0 assumes at least one score is positive.
    int highest_math = 0;
    size_t top_student_idx = 0;
    
    students.get<int>("math_scores").forEach([&](int score, size_t idx) {
        if (score > highest_math) {
            highest_math = score;
            top_student_idx = idx;
        }
    });
    
    std::cout << "Student with highest math score: " 
              << students.get<std::string>("names")[top_student_idx]
              << " (" << highest_math << ")" << std::endl;
    
    return 0;
}

Creating Dataframes from Data Sources

Dataframes can be populated from various data sources, such as CSV files or JSON data, using the library's I/O functions.

Loading Data into a Dataframe

#include <dataframe/Dataframe.h>
#include <dataframe/Serie.h>
#include <dataframe/io/csv.h>
#include <iostream>
#include <string>

int main() {
    try {
        // Load data from a CSV file
        df::Dataframe data = df::io::read_csv("data.csv");
        
        // Print information about the loaded data
        std::cout << "Loaded " << data.size() << " series from CSV." << std::endl;
        std::cout << "Series names: ";
        for (const auto& name : data.names()) {
            std::cout << name << " ";
        }
        std::cout << std::endl;
        
        // Access specific columns by name.
        // get<T>() throws if the column's stored type is not T; the catch
        // block below reports that.
        if (data.has("temperature")) {
            std::cout << "Temperature data: " << data.get<double>("temperature") << std::endl;
        }
        
        if (data.has("date")) {
            std::cout << "First date: " << data.get<std::string>("date")[0] << std::endl;
        }
        
        // Process the data
        if (data.has("temperature") && data.has("humidity")) {
            // Calculate a heat index (simplified).
            // get() returns a const reference, so bind to it instead of
            // copying each Serie. The references are only used before the
            // add() call below.
            const auto& temps = data.get<double>("temperature");
            const auto& humidity = data.get<double>("humidity");
            
            auto heat_index = df::zip(temps, humidity).map([](const auto& tuple, size_t) {
                const auto& [t, h] = tuple;
                return t + 0.05 * h;  // Simplified formula
            });
            
            // Add the new Serie to the Dataframe
            data.add("heat_index", heat_index);
            
            std::cout << "Added heat index calculations." << std::endl;
        }
        
        // Save the modified data back to CSV
        df::io::write_csv(data, "processed_data.csv");
        std::cout << "Saved processed data to CSV." << std::endl;
        
    } catch (const std::exception& e) {
        // Covers load/save failures as well as missing or mistyped columns
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }
    
    return 0;
}

Implementation Notes

  • Each Serie in a Dataframe is stored with type information, allowing type-safe access.
  • Series in a Dataframe can have different element types.
  • Accessing a Serie with the wrong type throws a std::runtime_error.
  • Adding a Serie with a name that already exists throws a std::runtime_error.
  • Removing a Serie that doesn't exist throws a std::runtime_error.
  • Dataframe does not modify the Series it contains; it only stores references to them.
  • The Dataframe class uses type erasure to store Series of different types.

Related Classes and Functions