# SCRIPT SETUP 1: Import necessary libraries
import pandas as pd
import pyodbc
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# SCRIPT SETUP 2: Database connection details for SQL Server
# Replace with your actual server name database name and credentials
DB_CONFIG = {
'driver': '{ODBC Driver 17 for SQL Server}',
'server': 'YOUR_SQL_SERVER_NAME', # e.g., 'localhost' or 'SERVER_NAME\SQLEXPRESS'
'database': 'YOUR_DATABASE_NAME',
'uid': 'YOUR_USERNAME',
'pwd': 'YOUR_PASSWORD'
}
# FUNCTION 1: Connect to the database
def connect_to_db():
"""Establishes a connection to the SQL Server database."""
conn_str = (
f"DRIVER={DB_CONFIG['driver']};"
f"SERVER={DB_CONFIG['server']};"
f"DATABASE={DB_CONFIG['database']};"
f"UID={DB_CONFIG['uid']};"
f"PWD={DB_CONFIG['pwd']};"
)
try:
conn = pyodbc.connect(conn_str)
print("Successfully connected to SQL Server.")
return conn
except pyodbc.Error as ex:
sqlstate = ex.args[0]
print(f"Database connection error: {sqlstate}")
print(ex)
return None
# FUNCTION 2: Fetch aggregated user behavior data
def fetch_user_behavior_data(conn):
"""
Fetches aggregated user behavior data for clustering.
This query calculates total sessions, page views, and estimated time on site per user.
"""
query = """
WITH UserSessionMetrics AS (
SELECT
e.user_pseudo_id,
e.session_id,
MIN(e.event_timestamp) AS session_start_timestamp,
MAX(e.event_timestamp) AS session_end_timestamp,
COUNT(DISTINCT e.page_location) AS unique_pages_visited_in_session,
COUNT(e.event_name) AS total_events_in_session,
MAX(CASE WHEN e.event_name = 'page_view' THEN 1 ELSE 0 END) AS has_page_view
FROM
events e
GROUP BY
e.user_pseudo_id, e.session_id
)
SELECT
usm.user_pseudo_id,
COUNT(usm.session_id) AS total_sessions,
SUM(CASE WHEN usm.has_page_view = 1 THEN 1 ELSE 0 END) AS total_page_views, -- Simplified, ideally sum of actual page_view events
AVG(CAST((usm.session_end_timestamp - usm.session_start_timestamp) AS DECIMAL(18,2)) / 1000000.0) AS avg_session_duration_seconds,
SUM(CASE WHEN usm.total_events_in_session = 1 AND usm.has_page_view = 1 THEN 1 ELSE 0 END) AS bounced_sessions,
SUM(usm.unique_pages_visited_in_session) AS total_unique_pages_visited -- Sum of unique pages across all sessions for a user
FROM
UserSessionMetrics usm
GROUP BY
usm.user_pseudo_id
HAVING
COUNT(usm.session_id) > 0; -- Ensure users have at least one session
"""
try:
df = pd.read_sql(query, conn)
print(f"Fetched {len(df)} user behavior records for clustering.")
return df
except Exception as e:
print(f"Error fetching user behavior data: {e}")
return None
# FUNCTION 3: Perform User Clustering
def perform_user_clustering(df_user_data, n_clusters=3):
"""
Performs K-Means clustering on user behavior data.
"""
if df_user_data is None or df_user_data.empty:
print("No user data to perform clustering.")
return None
# STEP 3.1: Select features for clustering
# Ensure all selected features are numeric. Handle potential NaNs by filling with 0 or mean.
features = [
'total_sessions',
'total_page_views',
'avg_session_duration_seconds',
'bounced_sessions',
'total_unique_pages_visited'
]
# Fill any NaN values in features, e.g., with 0 or mean
df_user_data[features] = df_user_data[features].fillna(0)
X = df_user_data[features]
# STEP 3.2: Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Features scaled successfully.")
# STEP 3.3: Determine optimal number of clusters (Elbow Method - for visualization)
# This part is for demonstrating how to find optimal K, not for automatic selection
# It's good practice to run this separately and then decide on n_clusters
wcss = [] # Within-cluster sum of squares
max_k = min(10, len(X)) # Don't try more clusters than data points
if max_k > 1:
for i in range(1, max_k):
kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init=10)
kmeans.fit(X_scaled)
wcss.append(kmeans.inertia_)
plt.figure(figsize=(8, 5))
plt.plot(range(1, max_k), wcss, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('WCSS')
plt.grid(True)
plt.show()
print("Elbow method plot displayed. Choose optimal K based on the plot.")
else:
print("Not enough data points to perform Elbow Method for optimal K.")
# STEP 3.4: Apply K-Means clustering with chosen n_clusters
# Here, n_clusters is passed as an argument, you would typically choose this after Elbow Method
if len(X) < n_clusters:
print(f"Cannot create {n_clusters} clusters with only {len(X)} data points. Adjust n_clusters.")
return None
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=10)
df_user_data['cluster'] = kmeans.fit_predict(X_scaled)
print(f"Clustering completed with {n_clusters} clusters.")
# STEP 3.5: Analyze cluster characteristics
cluster_summary = df_user_data.groupby('cluster')[features].mean()
print("\n--- Cluster Characteristics (Average Feature Values) ---")
print(cluster_summary)
# Optional: Count users in each cluster
cluster_counts = df_user_data['cluster'].value_counts().sort_index()
print("\n--- Number of Users per Cluster ---")
print(cluster_counts)
return df_user_data, cluster_summary, scaler # Return scaler for potential inverse transform if needed
# MAIN EXECUTION 1: This block runs when the script starts
if __name__ == "__main__":
# MAIN EXECUTION 2: Connect to the database
conn = connect_to_db()
if conn:
# MAIN EXECUTION 3: Fetch user behavior data
user_data_df = fetch_user_behavior_data(conn)
# MAIN EXECUTION 4: Perform user clustering
if user_data_df is not None and not user_data_df.empty:
# You might adjust n_clusters based on the Elbow Method output
final_df, summary, data_scaler = perform_user_clustering(user_data_df, n_clusters=3)
if final_df is not None:
print("\n--- User Segmentation Analysis Completed ---")
# You can now use 'final_df' (with 'cluster' column) and 'summary' for further analysis,
# e.g., saving to CSV or visualizing with more complex plots.
# final_df.to_csv('E:/SankalanAnalytics/data/processed/user_segments.csv', index=False)
# MAIN EXECUTION 5: Close the database connection
conn.close()
print("Database connection closed.")
else:
print("Could not establish database connection. Exiting.")