EdgarDataScientist's picture
Upload 10 files
e0356a0 verified
# File: src/generate_data.py
import pandas as pd
import random
import uuid
from faker import Faker
fake = Faker()
def generate_customer_record():
customer_id = str(uuid.uuid4())
age = random.randint(18, 80)
gender = random.choice(["Male", "Female", "Other"])
income = round(random.uniform(20000, 150000), 2)
purchase_frequency = random.randint(1, 100)
avg_spend = round(random.uniform(10, 2000), 2)
churn_risk = random.choice(["Low", "Medium", "High"])
# Sector and sector‑specific details
sector = random.choice(["Retail", "E-commerce", "Banking", "Telecom", "Travel"])
if sector in ["Retail", "E-commerce"]:
product_category = random.choice(["Electronics", "Fashion", "Home", "Sports", "Beauty"])
avg_rating = round(random.uniform(1, 5), 1)
cart_abandon_rate = round(random.uniform(0, 0.5), 2)
extra = {"Sector": sector, "ProductCategory": product_category,
"AvgRating": avg_rating, "CartAbandonRate": cart_abandon_rate}
elif sector == "Banking":
credit_score = random.randint(300, 850)
num_transactions = random.randint(10, 200)
extra = {"Sector": sector, "CreditScore": credit_score,
"NumTransactions": num_transactions}
elif sector == "Telecom":
monthly_data = round(random.uniform(0.5, 50), 2)
call_minutes = random.randint(100, 3000)
extra = {"Sector": sector, "MonthlyDataGB": monthly_data,
"CallMinutes": call_minutes}
elif sector == "Travel":
trips_per_year = random.randint(0, 15)
loyalty_tier = random.choice(["Bronze", "Silver", "Gold", "Platinum"])
extra = {"Sector": sector, "TripsPerYear": trips_per_year,
"LoyaltyTier": loyalty_tier}
else:
extra = {"Sector": sector}
record = {
"CustomerID": customer_id,
"Age": age,
"Gender": gender,
"Income": income,
"PurchaseFrequency": purchase_frequency,
"AvgSpend": avg_spend,
"ChurnRisk": churn_risk
}
record.update(extra)
return record
def generate_dataset(num_records=30000):
records = [generate_customer_record() for _ in range(num_records)]
df = pd.DataFrame(records)
# Save CSV to the data folder
df.to_csv("data\\customers.csv", index=False)
print(f" Generated {num_records} customer records and saved to ../data/customers.csv")
if __name__ == "__main__":
generate_dataset()