# SCRIPT SETUP 1: Import necessary libraries
import pandas as pd
import pyodbc
# SCRIPT SETUP 2: Database connection details for SQL Server
# Replace with your actual server name database name and credentials
DB_CONFIG = {
'driver': '{ODBC Driver 17 for SQL Server}',
'server': 'YOUR_SQL_SERVER_NAME', # e.g., 'localhost' or 'SERVER_NAME\SQLEXPRESS'
'database': 'YOUR_DATABASE_NAME',
'uid': 'YOUR_USERNAME',
'pwd': 'YOUR_PASSWORD'
}
# FUNCTION 1: Connect to the database
def connect_to_db():
"""Establishes a connection to the SQL Server database."""
conn_str = (
f"DRIVER={DB_CONFIG['driver']};"
f"SERVER={DB_CONFIG['server']};"
f"DATABASE={DB_CONFIG['database']};"
f"UID={DB_CONFIG['uid']};"
f"PWD={DB_CONFIG['pwd']};"
)
try:
conn = pyodbc.connect(conn_str)
print("Successfully connected to SQL Server.")
return conn
except pyodbc.Error as ex:
sqlstate = ex.args[0]
print(f"Database connection error: {sqlstate}")
print(ex)
return None
# FUNCTION 2: Fetch event data
def fetch_event_data(conn):
"""Fetches necessary event data from the SQL Server events table."""
query = """
SELECT
user_pseudo_id,
session_id,
event_name,
device_category,
traffic_source,
traffic_medium,
geo_country,
geo_city
FROM
events;
"""
try:
df = pd.read_sql(query, conn)
print(f"Fetched {len(df)} event records for behavior analysis.")
return df
except Exception as e:
print(f"Error fetching event data: {e}")
return None
# FUNCTION 3: Analyze User Behavior by Segment
def analyze_user_behavior_by_segment(df_events):
"""Calculates key metrics for user behavior segmented by device, source, and location."""
if df_events is None or df_events.empty:
print("No event data to analyze user behavior.")
return None
# Ensure relevant columns are strings and fill missing values
df_events['device_category'] = df_events['device_category'].fillna('unknown').astype(str)
df_events['traffic_source'] = df_events['traffic_source'].fillna('unknown').astype(str)
df_events['traffic_medium'] = df_events['traffic_medium'].fillna('unknown').astype(str)
df_events['geo_country'] = df_events['geo_country'].fillna('unknown').astype(str)
df_events['geo_city'] = df_events['geo_city'].fillna('unknown').astype(str)
# STEP 3.1: Calculate metrics by Device Category
print("\n--- User Behavior by Device Category ---")
device_behavior = df_events.groupby('device_category').agg(
total_users=('user_pseudo_id', 'nunique'),
total_sessions=('session_id', 'nunique'),
total_page_views=('event_name', lambda x: (x == 'page_view').sum())
).reset_index()
# Calculate bounce rate for device category
# Get event count per session
events_per_session = df_events.groupby(['user_pseudo_id', 'session_id']).size().reset_index(name='event_count')
# Merge event count back to df_events
df_events_with_counts = pd.merge(df_events, events_per_session, on=['user_pseudo_id', 'session_id'], how='left')
# Identify bounced sessions (single event sessions that were page views)
bounced_sessions_device = df_events_with_counts[
(df_events_with_counts['event_count'] == 1) & (df_events_with_counts['event_name'] == 'page_view')
].groupby('device_category')['session_id'].nunique().reset_index(name='bounced_sessions')
device_behavior = pd.merge(device_behavior, bounced_sessions_device, on='device_category', how='left').fillna(0)
device_behavior['bounce_rate'] = (device_behavior['bounced_sessions'] / device_behavior['total_sessions']) * 100
device_behavior['bounce_rate'] = device_behavior['bounce_rate'].fillna(0).round(2)
print(device_behavior.sort_values('total_sessions', ascending=False))
# STEP 3.2: Calculate metrics by Traffic Source and Medium
print("\n--- User Behavior by Traffic Source / Medium ---")
source_medium_behavior = df_events.groupby(['traffic_source', 'traffic_medium']).agg(
total_users=('user_pseudo_id', 'nunique'),
total_sessions=('session_id', 'nunique'),
total_page_views=('event_name', lambda x: (x == 'page_view').sum())
).reset_index()
# Calculate bounce rate for source/medium
bounced_sessions_source_medium = df_events_with_counts[
(df_events_with_counts['event_count'] == 1) & (df_events_with_counts['event_name'] == 'page_view')
].groupby(['traffic_source', 'traffic_medium'])['session_id'].nunique().reset_index(name='bounced_sessions')
source_medium_behavior = pd.merge(source_medium_behavior, bounced_sessions_source_medium, on=['traffic_source', 'traffic_medium'], how='left').fillna(0)
source_medium_behavior['bounce_rate'] = (source_medium_behavior['bounced_sessions'] / source_medium_behavior['total_sessions']) * 100
source_medium_behavior['bounce_rate'] = source_medium_behavior['bounce_rate'].fillna(0).round(2)
print(source_medium_behavior.sort_values('total_sessions', ascending=False).head(10)) # Show top 10
# STEP 3.3: Calculate metrics by Geo Country
print("\n--- User Behavior by Geo Country ---")
country_behavior = df_events.groupby('geo_country').agg(
total_users=('user_pseudo_id', 'nunique'),
total_sessions=('session_id', 'nunique'),
total_page_views=('event_name', lambda x: (x == 'page_view').sum())
).reset_index()
# Calculate bounce rate for country
bounced_sessions_country = df_events_with_counts[
(df_events_with_counts['event_count'] == 1) & (df_events_with_counts['event_name'] == 'page_view')
].groupby('geo_country')['session_id'].nunique().reset_index(name='bounced_sessions')
country_behavior = pd.merge(country_behavior, bounced_sessions_country, on='geo_country', how='left').fillna(0)
country_behavior['bounce_rate'] = (country_behavior['bounced_sessions'] / country_behavior['total_sessions']) * 100
country_behavior['bounce_rate'] = country_behavior['bounce_rate'].fillna(0).round(2)
print(country_behavior.sort_values('total_sessions', ascending=False).head(10)) # Show top 10
# STEP 3.4: Calculate metrics by Geo City (optional, can be very granular)
# This can be very detailed, so only show top few cities
print("\n--- User Behavior by Geo City (Top 10) ---")
city_behavior = df_events.groupby('geo_city').agg(
total_users=('user_pseudo_id', 'nunique'),
total_sessions=('session_id', 'nunique'),
total_page_views=('event_name', lambda x: (x == 'page_view').sum())
).reset_index()
# Calculate bounce rate for city
bounced_sessions_city = df_events_with_counts[
(df_events_with_counts['event_count'] == 1) & (df_events_with_counts['event_name'] == 'page_view')
].groupby('geo_city')['session_id'].nunique().reset_index(name='bounced_sessions')
city_behavior = pd.merge(city_behavior, bounced_sessions_city, on='geo_city', how='left').fillna(0)
city_behavior['bounce_rate'] = (city_behavior['bounced_sessions'] / city_behavior['total_sessions']) * 100
city_behavior['bounce_rate'] = city_behavior['bounce_rate'].fillna(0).round(2)
print(city_behavior.sort_values('total_sessions', ascending=False).head(10))
return device_behavior, source_medium_behavior, country_behavior, city_behavior
# MAIN EXECUTION 1: This block runs when the script starts
if __name__ == "__main__":
# MAIN EXECUTION 2: Connect to the database
conn = connect_to_db()
if conn:
# MAIN EXECUTION 3: Fetch event data
events_df = fetch_event_data(conn)
# MAIN EXECUTION 4: Analyze user behavior
if events_df is not None:
device_bhv, source_medium_bhv, country_bhv, city_bhv = analyze_user_behavior_by_segment(events_df)
if device_bhv is not None:
print("\n--- User Behavior Analysis Completed ---")
# You can now use these DataFrames for plotting or further analysis
# For example, to save to new CSVs:
# device_bhv.to_csv('E:/SankalanAnalytics/data/processed/device_behavior.csv', index=False)
# source_medium_bhv.to_csv('E:/SankalanAnalytics/data/processed/source_medium_behavior.csv', index=False)
# country_bhv.to_csv('E:/SankalanAnalytics/data/processed/country_behavior.csv', index=False)
# MAIN EXECUTION 5: Close the database connection
conn.close()
print("Database connection closed.")
else:
print("Could not establish database connection. Exiting.")