Source code for universal_ml_framework.utils.data_generator

# DATA GENERATOR
# Generate synthetic datasets for testing and demonstration

import pandas as pd
import numpy as np

class DataGenerator:
    """Generate synthetic datasets for various ML problems.

    Every generator reseeds NumPy's *global* random state with a fixed seed,
    so repeated calls with the same ``n_samples`` return identical data.
    Callers that rely on the global NumPy RNG elsewhere should be aware of
    that side effect.
    """

    @staticmethod
    def _save_train_test_split(df, target, prefix, label, output_dir):
        """Write an 80/20 train/test split of ``df`` to CSV and print shapes.

        Args:
            df: Full generated dataset, including the target column.
            target: Name of the target column; it is dropped from the test
                file so that file only carries features.
            prefix: File name prefix; files are ``<prefix>_train.csv`` and
                ``<prefix>_test.csv``.
            label: Human-readable dataset name used in the status message.
            output_dir: Directory the CSV files are written to. It is created
                (including parents) when it does not exist yet.
        """
        # Function-scope import keeps this class self-contained without
        # touching the module's import block.
        import os

        # An empty string means "current directory"; makedirs('') would raise.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # The split is positional (first 80% of rows), not shuffled.
        train_size = int(0.8 * len(df))
        train_df = df[:train_size].copy()
        test_df = df[train_size:].drop(target, axis=1)

        train_df.to_csv(
            os.path.join(output_dir, f'{prefix}_train.csv'), index=False
        )
        test_df.to_csv(
            os.path.join(output_dir, f'{prefix}_test.csv'), index=False
        )

        print(f"✅ {label} dataset generated")
        print(f" Train: {train_df.shape}")
        print(f" Test: {test_df.shape}")

    @staticmethod
    def generate_house_prices(n_samples=1000, save_to_csv=True,
                              output_dir='data'):
        """Generate synthetic house prices dataset.

        Args:
            n_samples: Number of rows to generate.
            save_to_csv: When true, write ``house_train.csv`` (with
                ``SalePrice``) and ``house_test.csv`` (without it) to
                ``output_dir``.
            output_dir: Destination directory for the CSV files.

        Returns:
            pandas.DataFrame with nine feature columns plus ``SalePrice``.
        """
        np.random.seed(42)

        # NOTE: the order of the np.random calls below determines the
        # generated values; do not reorder them.
        data = {
            'LotArea': np.random.normal(10000, 3000, n_samples),
            'YearBuilt': np.random.randint(1950, 2020, n_samples),
            'BedroomAbvGr': np.random.randint(1, 6, n_samples),
            'BathroomAbvGr': np.random.randint(1, 4, n_samples),
            'GarageArea': np.random.normal(500, 200, n_samples),
            'Neighborhood': np.random.choice(
                ['Downtown', 'Suburb', 'Rural'], n_samples),
            'HouseStyle': np.random.choice(
                ['1Story', '2Story', 'Split'], n_samples),
            'HasPool': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
            'HasGarage': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
        }

        # Linear combination of the numeric features plus Gaussian noise.
        price = (
            data['LotArea'] * 0.01
            + (2020 - data['YearBuilt']) * -100
            + data['BedroomAbvGr'] * 15000
            + data['BathroomAbvGr'] * 10000
            + data['GarageArea'] * 50
            + data['HasPool'] * 25000
            + data['HasGarage'] * 15000
            + np.random.normal(0, 20000, n_samples)
        )
        # Floor the price so noise cannot push it below 50000.
        data['SalePrice'] = np.maximum(price, 50000)

        df = pd.DataFrame(data)

        if save_to_csv:
            DataGenerator._save_train_test_split(
                df, 'SalePrice', 'house', 'House prices', output_dir)

        return df

    @staticmethod
    def generate_customer_churn(n_samples=800, save_to_csv=True,
                                output_dir='data'):
        """Generate synthetic customer churn dataset.

        Args:
            n_samples: Number of rows to generate.
            save_to_csv: When true, write ``customer_train.csv`` (with
                ``Churn``) and ``customer_test.csv`` (without it) to
                ``output_dir``.
            output_dir: Destination directory for the CSV files.

        Returns:
            pandas.DataFrame with ten feature columns plus a 0/1 ``Churn``
            column.
        """
        np.random.seed(123)

        # NOTE: the order of the np.random calls below determines the
        # generated values; do not reorder them.
        data = {
            'Age': np.random.randint(18, 80, n_samples),
            'MonthlyCharges': np.random.normal(65, 20, n_samples),
            'TotalCharges': np.random.normal(2000, 1500, n_samples),
            'Tenure': np.random.randint(1, 72, n_samples),
            'Contract': np.random.choice(
                ['Month-to-month', 'One year', 'Two year'], n_samples),
            'PaymentMethod': np.random.choice(
                ['Electronic check', 'Mailed check', 'Bank transfer',
                 'Credit card'], n_samples),
            'InternetService': np.random.choice(
                ['DSL', 'Fiber optic', 'No'], n_samples),
            'HasPhoneService': np.random.choice(
                [0, 1], n_samples, p=[0.3, 0.7]),
            'HasMultipleLines': np.random.choice(
                [0, 1], n_samples, p=[0.6, 0.4]),
            'SeniorCitizen': np.random.choice(
                [0, 1], n_samples, p=[0.8, 0.2]),
        }

        # Additive churn score: each risk factor contributes a fixed weight,
        # plus uniform noise in [0, 0.3).
        churn_prob = (
            (data['MonthlyCharges'] > 80) * 0.3
            + (data['Tenure'] < 12) * 0.4
            + (np.array(data['Contract']) == 'Month-to-month') * 0.3
            + data['SeniorCitizen'] * 0.2
            + np.random.random(n_samples) * 0.3
        )
        # Scores above 0.5 are labelled as churned.
        data['Churn'] = (churn_prob > 0.5).astype(int)

        df = pd.DataFrame(data)

        if save_to_csv:
            DataGenerator._save_train_test_split(
                df, 'Churn', 'customer', 'Customer churn', output_dir)

        return df

    @staticmethod
    def generate_sales_forecasting(n_samples=600, save_to_csv=True,
                                   output_dir='data'):
        """Generate synthetic sales forecasting dataset.

        Args:
            n_samples: Number of rows to generate.
            save_to_csv: When true, write ``sales_train.csv`` (with
                ``Sales``) and ``sales_test.csv`` (without it) to
                ``output_dir``.
            output_dir: Destination directory for the CSV files.

        Returns:
            pandas.DataFrame with nine feature columns plus ``Sales``.
        """
        np.random.seed(456)

        # NOTE: the order of the np.random calls below determines the
        # generated values; do not reorder them.
        data = {
            'Month': np.random.randint(1, 13, n_samples),
            'DayOfWeek': np.random.randint(1, 8, n_samples),
            'Temperature': np.random.normal(20, 10, n_samples),
            'Humidity': np.random.normal(60, 15, n_samples),
            'WindSpeed': np.random.normal(10, 5, n_samples),
            'Holiday': np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
            'Promotion': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
            'StoreType': np.random.choice(
                ['Mall', 'Street', 'Online'], n_samples),
            'CompetitorDistance': np.random.normal(2000, 1000, n_samples),
        }

        # Days 6 and 7 get the weekend uplift. ndarray has no ``.isin``
        # method (that is the pandas API), so use the ``np.isin`` function.
        is_weekend = np.isin(data['DayOfWeek'], [6, 7])

        sales = (
            1000
            + (data['Month'] == 12) * 500
            + is_weekend * 300
            + data['Temperature'] * 10
            + data['Holiday'] * 800
            + data['Promotion'] * 400
            + -data['CompetitorDistance'] * 0.1
            + np.random.normal(0, 200, n_samples)
        )
        # Floor sales so noise cannot push them below 100.
        data['Sales'] = np.maximum(sales, 100)

        df = pd.DataFrame(data)

        if save_to_csv:
            DataGenerator._save_train_test_split(
                df, 'Sales', 'sales', 'Sales forecasting', output_dir)

        return df

    @staticmethod
    def generate_all_datasets(output_dir='data'):
        """Generate all synthetic datasets and save them as CSV files.

        Args:
            output_dir: Destination directory passed to every generator.
        """
        print("🔄 Generating all synthetic datasets...")
        print("-" * 40)

        DataGenerator.generate_house_prices(output_dir=output_dir)
        DataGenerator.generate_customer_churn(output_dir=output_dir)
        DataGenerator.generate_sales_forecasting(output_dir=output_dir)

        print("\n✅ All datasets generated successfully!")
        print("Ready to use with the pipeline framework.")