Troubleshooting Guide
Common Issues and Solutions
Installation Issues
ImportError: No module named 'universal_ml_framework'
# Ensure package is installed
pip install universal-ml-framework
# Or install from source
pip install -e .
Missing Dependencies
# Install all required dependencies
pip install pandas scikit-learn numpy joblib
# For Bayesian optimization (optional)
pip install scikit-optimize
Data Loading Problems
FileNotFoundError: No such file or directory
import os
# Check if file exists
if os.path.exists('your_data.csv'):
    pipeline.run_pipeline('your_data.csv', 'target')
else:
    print("File not found. Check the file path.")
UnicodeDecodeError when loading CSV
# Try different encodings
import pandas as pd
try:
    df = pd.read_csv('data.csv', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv('data.csv', encoding='latin-1')
Empty DataFrame after loading
# Check data after loading
pipeline.load_data('train.csv', target_column='target')
print(f"Training data shape: {pipeline.train_df.shape}")
if pipeline.train_df.empty:
    print("Warning: Empty dataset loaded")
Memory and Performance Issues
MemoryError: Unable to allocate memory
# Solution 1: Enable fast mode
pipeline = UniversalMLPipeline(fast_mode=True)
# Solution 2: Reduce parallel jobs
pipeline = UniversalMLPipeline(n_jobs=2)
# Solution 3: Sample your data
import pandas as pd
df = pd.read_csv('large_data.csv').sample(n=10000)
df.to_csv('sampled_data.csv', index=False)
Training takes too long
# Enable fast mode for quick results
pipeline = UniversalMLPipeline(
fast_mode=True, # Fewer models and CV folds
tuning_method='random', # Faster than grid search
n_jobs=-1 # Use all CPU cores
)
High CPU usage
# Limit CPU cores usage
pipeline = UniversalMLPipeline(n_jobs=2) # Use only 2 cores
Feature Detection Issues
No features detected
# Check your data
print(pipeline.train_df.dtypes)
print(pipeline.train_df.describe())
# Manually specify features if needed
custom_features = ['feature1', 'feature2', 'feature3']
pipeline.run_pipeline('data.csv', 'target', custom_features=custom_features)
Wrong feature types detected
# Override automatic detection
pipeline.feature_types = {
'numeric': ['age', 'income', 'score'],
'categorical': ['city', 'category'],
'binary': ['is_member', 'has_discount']
}
Too many categorical features after one-hot encoding
# Exclude high-cardinality categorical columns
pipeline.run_pipeline(
'data.csv',
'target',
exclude_columns=['high_cardinality_column']
)
Model Training Problems
ValueError: Input contains NaN
# Check for missing values
print(pipeline.train_df.isnull().sum())
# The pipeline should handle this automatically
# If it persists, check your custom feature engineering function
All models perform poorly
# Check data quality
print(f"Target distribution: {pipeline.train_df[target_column].value_counts()}")
# Try different problem type
pipeline = UniversalMLPipeline(problem_type='regression') # or 'classification'
# Add custom feature engineering
def improve_features(df):
    # Add your feature engineering here
    return df
pipeline.run_pipeline('data.csv', 'target', feature_engineering_func=improve_features)
Hyperparameter tuning fails
# Try different tuning method
pipeline = UniversalMLPipeline(tuning_method='random') # Instead of 'grid'
# Or disable tuning temporarily
pipeline.hyperparameter_tuning = lambda: None
Prediction Issues
No test data available warning
# Provide test data path
pipeline.run_pipeline('train.csv', 'target', test_path='test.csv')
# Or make predictions separately
predictions = pipeline.make_predictions(save_predictions=True)
ID column mismatch in predictions
# Specify correct ID column
pipeline.run_pipeline(
'train.csv',
'target',
'test.csv',
id_column='PassengerId' # Use your actual ID column name
)
Predictions file not generated
# Check if test data is provided
if pipeline.test_df is not None:
    predictions = pipeline.make_predictions(save_predictions=True)
else:
    print("No test data provided for predictions")
Error Messages and Solutions
Performance Optimization Tips
Speed Up Training
# Fastest configuration
pipeline = UniversalMLPipeline(
fast_mode=True, # Reduced model set
tuning_method='random', # Faster than grid search
n_jobs=-1, # Use all cores
verbose=False # Reduce output overhead
)
Reduce Memory Usage
# Memory-efficient configuration
pipeline = UniversalMLPipeline(
fast_mode=True, # Fewer models in memory
n_jobs=1 # Reduce parallel overhead
)
# Process data in chunks if very large
chunk_size = 10000
for chunk in pd.read_csv('large_file.csv', chunksize=chunk_size):
    # Process each chunk separately
    pass
Debug Mode
# Enable verbose mode for debugging
pipeline = UniversalMLPipeline(verbose=True)
# Check intermediate results
pipeline.load_data('train.csv', target_column='target')
print("Feature types:", pipeline.feature_types)
pipeline.create_preprocessor()
pipeline.prepare_data()
print("X shape:", pipeline.X.shape)
print("y shape:", pipeline.y.shape)
Getting Help
Check Documentation
API Reference: Detailed method documentation
Examples: Working code examples
Architecture: Understanding the framework design
Common Debugging Steps
Check Data Quality
print(df.info())
print(df.describe())
print(df.isnull().sum())
Verify Configuration
print(f"Problem type: {pipeline.problem_type}")
print(f"Feature types: {pipeline.feature_types}")
print(f"Models: {list(pipeline.models.keys())}")
Test with Sample Data
# Test with small sample first
sample_df = df.sample(n=1000)
sample_df.to_csv('sample.csv', index=False)
Enable Verbose Output
pipeline = UniversalMLPipeline(verbose=True)
Report Issues
When reporting issues, please include:
Python version and operating system
Package versions (pandas, scikit-learn, etc.)
Complete error traceback
Minimal code example to reproduce the issue
Dataset characteristics (size, types, etc.)
Example Issue Report:
Environment:
- Python 3.8.10
- universal-ml-framework 1.0.1
- pandas 1.3.0
- scikit-learn 1.0.2
Issue:
MemoryError when processing a 100K-row dataset
Code:
pipeline = UniversalMLPipeline()
pipeline.run_pipeline('large_data.csv', 'target')
Error:
[Full traceback here]