# Source code for universal_ml_framework.utils.data_generator

# DATA GENERATOR
# Generate synthetic datasets for testing and demonstration

import pandas as pd
import numpy as np


class DataGenerator:
    """Generate synthetic datasets for various ML problems.

    Every generator is a static method that reseeds NumPy's *global* random
    state with a fixed value before drawing, so repeated calls with the same
    ``n_samples`` return identical data. When ``save_to_csv`` is true, the
    first 80% of the rows are written to ``data/<name>_train.csv`` and the
    remaining rows, with the target column removed, to
    ``data/<name>_test.csv`` (paths are relative to the current working
    directory).
    """

    @staticmethod
    def _save_split(df, target, prefix, label):
        """Write an 80/20 train/test split of ``df`` as CSV files under ``data/``.

        Args:
            df: Full dataset, including the target column.
            target: Name of the target column; dropped from the test file.
            prefix: File name prefix, e.g. ``'house'`` -> ``data/house_train.csv``.
            label: Human-readable dataset name used in the printed report.
        """
        # Local import: the module header only imports pandas and numpy.
        import os

        # to_csv() refuses to write into a directory that does not exist.
        os.makedirs('data', exist_ok=True)

        # The split is positional (no shuffling): first 80% train, rest test.
        train_size = int(0.8 * len(df))
        train_df = df[:train_size].copy()
        test_df = df[train_size:].drop(target, axis=1)

        train_df.to_csv(f'data/{prefix}_train.csv', index=False)
        test_df.to_csv(f'data/{prefix}_test.csv', index=False)

        print(f"✅ {label} dataset generated")
        print(f"   Train: {train_df.shape}")
        print(f"   Test: {test_df.shape}")

    @staticmethod
    def generate_house_prices(n_samples=1000, save_to_csv=True):
        """Generate a synthetic house prices dataset (target: ``SalePrice``).

        Args:
            n_samples: Number of rows to generate.
            save_to_csv: If true, also write ``data/house_train.csv`` and
                ``data/house_test.csv`` and print the resulting shapes.

        Returns:
            pandas.DataFrame with nine feature columns plus ``SalePrice``.
        """
        np.random.seed(42)

        data = {
            'LotArea': np.random.normal(10000, 3000, n_samples),
            'YearBuilt': np.random.randint(1950, 2020, n_samples),
            'BedroomAbvGr': np.random.randint(1, 6, n_samples),
            'BathroomAbvGr': np.random.randint(1, 4, n_samples),
            'GarageArea': np.random.normal(500, 200, n_samples),
            'Neighborhood': np.random.choice(['Downtown', 'Suburb', 'Rural'], n_samples),
            'HouseStyle': np.random.choice(['1Story', '2Story', 'Split'], n_samples),
            'HasPool': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
            'HasGarage': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
        }

        # Linear combination of the numeric features plus Gaussian noise.
        price = (
            data['LotArea'] * 0.01 +
            (2020 - data['YearBuilt']) * -100 +  # older houses are cheaper
            data['BedroomAbvGr'] * 15000 +
            data['BathroomAbvGr'] * 10000 +
            data['GarageArea'] * 50 +
            data['HasPool'] * 25000 +
            data['HasGarage'] * 15000 +
            np.random.normal(0, 20000, n_samples)
        )

        # Floor the price so noise cannot produce values below 50000.
        data['SalePrice'] = np.maximum(price, 50000)

        df = pd.DataFrame(data)

        if save_to_csv:
            DataGenerator._save_split(df, 'SalePrice', 'house', 'House prices')

        return df

    @staticmethod
    def generate_customer_churn(n_samples=800, save_to_csv=True):
        """Generate a synthetic customer churn dataset (target: ``Churn``).

        Args:
            n_samples: Number of rows to generate.
            save_to_csv: If true, also write ``data/customer_train.csv`` and
                ``data/customer_test.csv`` and print the resulting shapes.

        Returns:
            pandas.DataFrame with ten feature columns plus a 0/1 ``Churn``
            column.
        """
        np.random.seed(123)

        data = {
            'Age': np.random.randint(18, 80, n_samples),
            'MonthlyCharges': np.random.normal(65, 20, n_samples),
            'TotalCharges': np.random.normal(2000, 1500, n_samples),
            'Tenure': np.random.randint(1, 72, n_samples),
            'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
            'PaymentMethod': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_samples),
            'InternetService': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples),
            'HasPhoneService': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
            'HasMultipleLines': np.random.choice([0, 1], n_samples, p=[0.6, 0.4]),
            'SeniorCitizen': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        }

        # Additive churn score: each risk factor contributes a fixed weight,
        # plus uniform noise in [0, 0.3).
        churn_prob = (
            (data['MonthlyCharges'] > 80) * 0.3 +
            (data['Tenure'] < 12) * 0.4 +
            (np.array(data['Contract']) == 'Month-to-month') * 0.3 +
            data['SeniorCitizen'] * 0.2 +
            np.random.random(n_samples) * 0.3
        )

        # Threshold the score into a binary label.
        data['Churn'] = (churn_prob > 0.5).astype(int)

        df = pd.DataFrame(data)

        if save_to_csv:
            DataGenerator._save_split(df, 'Churn', 'customer', 'Customer churn')

        return df

    @staticmethod
    def generate_sales_forecasting(n_samples=600, save_to_csv=True):
        """Generate a synthetic sales forecasting dataset (target: ``Sales``).

        Args:
            n_samples: Number of rows to generate.
            save_to_csv: If true, also write ``data/sales_train.csv`` and
                ``data/sales_test.csv`` and print the resulting shapes.

        Returns:
            pandas.DataFrame with nine feature columns plus ``Sales``.
        """
        np.random.seed(456)

        data = {
            'Month': np.random.randint(1, 13, n_samples),
            'DayOfWeek': np.random.randint(1, 8, n_samples),
            'Temperature': np.random.normal(20, 10, n_samples),
            'Humidity': np.random.normal(60, 15, n_samples),
            'WindSpeed': np.random.normal(10, 5, n_samples),
            'Holiday': np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
            'Promotion': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
            'StoreType': np.random.choice(['Mall', 'Street', 'Online'], n_samples),
            'CompetitorDistance': np.random.normal(2000, 1000, n_samples),
        }

        # Baseline of 1000 plus additive effects and Gaussian noise.
        sales = (
            1000 +
            (data['Month'] == 12) * 500 +
            # ndarray has no .isin() method (that is pandas API); np.isin is
            # the NumPy equivalent. Days 6 and 7 get the weekend uplift.
            np.isin(data['DayOfWeek'], [6, 7]) * 300 +
            data['Temperature'] * 10 +
            data['Holiday'] * 800 +
            data['Promotion'] * 400 +
            -data['CompetitorDistance'] * 0.1 +
            np.random.normal(0, 200, n_samples)
        )

        # Floor the value so noise cannot produce sales below 100.
        data['Sales'] = np.maximum(sales, 100)

        df = pd.DataFrame(data)

        if save_to_csv:
            DataGenerator._save_split(df, 'Sales', 'sales', 'Sales forecasting')

        return df

    @staticmethod
    def generate_all_datasets():
        """Generate all three synthetic datasets with their default sizes.

        Calls each generator with ``save_to_csv`` left at its default, so the
        train/test CSV files for every dataset are written under ``data/``.
        Returns nothing.
        """
        print("🔄 Generating all synthetic datasets...")
        print("-" * 40)

        DataGenerator.generate_house_prices()
        DataGenerator.generate_customer_churn()
        DataGenerator.generate_sales_forecasting()

        print("\n✅ All datasets generated successfully!")
        print("Ready to use with the pipeline framework.")