# SCRIPT SETUP 1: Import necessary libraries
import pandas as pd
import pyodbc
from datetime import datetime, timedelta
# SCRIPT SETUP 2: Database connection details for SQL Server
# Replace with your actual server name, database name, and credentials
DB_CONFIG = {
    'driver': '{ODBC Driver 17 for SQL Server}',
    'server': 'YOUR_SQL_SERVER_NAME',  # e.g., 'localhost' or 'SERVER_NAME\SQLEXPRESS'
    'database': 'YOUR_DATABASE_NAME',
    'uid': 'YOUR_USERNAME',
    'pwd': 'YOUR_PASSWORD'
}
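# OPTIONAL (not part of the original setup): rather than hard-coding credentials, you could pull
# them from environment variables. A minimal sketch using only the standard library; the variable
# names SQL_UID and SQL_PWD are assumptions:
#   import os
#   DB_CONFIG['uid'] = os.environ.get('SQL_UID', DB_CONFIG['uid'])
#   DB_CONFIG['pwd'] = os.environ.get('SQL_PWD', DB_CONFIG['pwd'])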
# FUNCTION 1: Connect to the database
def connect_to_db():
"""Establishes a connection to the SQL Server database."""
conn_str = (
f"DRIVER={DB_CONFIG['driver']};"
f"SERVER={DB_CONFIG['server']};"
f"DATABASE={DB_CONFIG['database']};"
f"UID={DB_CONFIG['uid']};"
f"PWD={DB_CONFIG['pwd']};"
)
try:
conn = pyodbc.connect(conn_str)
print("Successfully connected to SQL Server.")
return conn
except pyodbc.Error as ex:
sqlstate = ex.args[0]
print(f"Database connection error: {sqlstate}")
print(ex)
return None
# FUNCTION 2: Fetch event data within a date range
def fetch_events_in_range(conn, start_date, end_date):
"""Fetches event data from SQL Server within a specified date range."""
# Convert dates to Unix microseconds for comparison with event_timestamp
start_timestamp_us = int(datetime.combine(start_date, datetime.min.time()).timestamp() * 1_000_000)
end_timestamp_us = int(datetime.combine(end_date + timedelta(days=1), datetime.min.time()).timestamp() * 1_000_000) - 1 # End of day
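    # NOTE (assumption about your data): .timestamp() interprets the naive datetime in the *local*
    # timezone. If event_timestamp is stored as UTC microseconds (as in GA4 exports), a UTC-safe
    # sketch would be:
    #   from datetime import timezone
    #   start_timestamp_us = int(datetime.combine(start_date, datetime.min.time(), tzinfo=timezone.utc).timestamp() * 1_000_000)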
query = f"""
SELECT
user_pseudo_id,
session_id,
event_timestamp,
event_name,
page_location,
traffic_source,
traffic_medium
FROM
events
WHERE
event_timestamp >= {start_timestamp_us} AND event_timestamp <= {end_timestamp_us}
ORDER BY
user_pseudo_id, session_id, event_timestamp;
"""
try:
df = pd.read_sql(query, conn)
print(f"Fetched {len(df)} event records for range {start_date} to {end_date}.")
return df
except Exception as e:
print(f"Error fetching event data for range {start_date} to {end_date}: {e}")
return None
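# NOTE: recent pandas versions emit a UserWarning when read_sql is handed a raw pyodbc connection
# and recommend a SQLAlchemy engine instead. A hedged sketch of one common form, assuming the
# sqlalchemy package is installed (placeholders are illustrative):
#   from sqlalchemy import create_engine
#   engine = create_engine("mssql+pyodbc://USER:PASSWORD@SERVER/DATABASE?driver=ODBC+Driver+17+for+SQL+Server")
#   df = pd.read_sql(query, engine)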
# FUNCTION 3: Calculate core metrics for a DataFrame
def calculate_core_metrics(df):
"""Calculates total users, sessions, and page views for a given DataFrame."""
if df is None or df.empty:
return 0, 0, 0
total_users = df['user_pseudo_id'].nunique()
total_sessions = df['session_id'].nunique()
total_page_views = df[df['event_name'] == 'page_view'].shape[0]
return total_users, total_sessions, total_page_views
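# Example usage (illustrative; df_events would come from fetch_events_in_range):
#   users, sessions, page_views = calculate_core_metrics(df_events)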
# FUNCTION 4: Diagnose Traffic Drop
def diagnose_traffic_drop(conn, problem_start_date, problem_end_date, compare_start_date, compare_end_date):
"""Diagnoses traffic drop by comparing metrics between two periods."""
# STEP 4.1: Fetch data for problem period
df_problem = fetch_events_in_range(conn, problem_start_date, problem_end_date)
# STEP 4.2: Fetch data for comparison period
df_compare = fetch_events_in_range(conn, compare_start_date, compare_end_date)
if df_problem is None or df_compare is None:
print("Could not fetch data for both periods. Cannot diagnose traffic drop.")
return
# STEP 4.3: Calculate core metrics for both periods
users_problem, sessions_problem, pageviews_problem = calculate_core_metrics(df_problem)
users_compare, sessions_compare, pageviews_compare = calculate_core_metrics(df_compare)
print(f"\n--- Traffic Comparison: {problem_start_date} to {problem_end_date} (Problem Period) vs. {compare_start_date} to {compare_end_date} (Comparison Period) ---")
print(f"Metric | Problem Period | Compare Period | Change (%)")
print("-" * 60)
# Calculate and print percentage changes
def print_change(metric_name, val_problem, val_compare):
change_percent = ((val_problem - val_compare) / val_compare) * 100 if val_compare > 0 else (100 if val_problem > 0 else 0)
print(f"{metric_name:<19} | {val_problem:<14} | {val_compare:<14} | {change_percent: .2f}%")
print_change("Total Users", users_problem, users_compare)
print_change("Total Sessions", sessions_problem, sessions_compare)
print_change("Total Page Views", pageviews_problem, pageviews_compare)
# STEP 4.4: Analyze page level drop-off in problem period
print("\n--- Top Pages by Page View Drop in Problem Period ---")
# Get page views for each page in problem period
page_views_problem_df = df_problem[df_problem['event_name'] == 'page_view'].groupby('page_location').size().reset_index(name='page_views_problem')
# Get page views for each page in comparison period
page_views_compare_df = df_compare[df_compare['event_name'] == 'page_view'].groupby('page_location').size().reset_index(name='page_views_compare')
# Merge and calculate change
page_comparison = pd.merge(page_views_problem_df, page_views_compare_df, on='page_location', how='outer').fillna(0)
page_comparison['page_view_change_percent'] = ((page_comparison['page_views_problem'] - page_comparison['page_views_compare']) / page_comparison['page_views_compare']) * 100
page_comparison = page_comparison.fillna(0) # Fill NaN from division by zero for pages only in one period
    # Filter for pages with significant traffic in the comparison period and show the largest drops
    MIN_VIEWS_THRESHOLD = 10  # Only consider pages with at least 10 views in the comparison period
    significant_drops = page_comparison[page_comparison['page_views_compare'] >= MIN_VIEWS_THRESHOLD].sort_values('page_view_change_percent').head(10)
    if not significant_drops.empty:
        print(significant_drops[['page_location', 'page_views_problem', 'page_views_compare', 'page_view_change_percent']])
    else:
        print("No significant page view drops found with current threshold.")
    # STEP 4.5: Analyze traffic source/medium drop-off in the problem period
    print("\n--- Top Traffic Source/Medium by Session Drop in Problem Period ---")
    # Get sessions per source/medium in the problem period
    source_sessions_problem_df = df_problem.groupby(['traffic_source', 'traffic_medium'])['session_id'].nunique().reset_index(name='sessions_problem')
    # Get sessions per source/medium in the comparison period
    source_sessions_compare_df = df_compare.groupby(['traffic_source', 'traffic_medium'])['session_id'].nunique().reset_index(name='sessions_compare')
    # Merge and calculate change
    source_comparison = pd.merge(source_sessions_problem_df, source_sessions_compare_df, on=['traffic_source', 'traffic_medium'], how='outer').fillna(0)
    source_comparison['session_change_percent'] = ((source_comparison['sessions_problem'] - source_comparison['sessions_compare']) / source_comparison['sessions_compare']) * 100
    # Sources with zero sessions in the comparison period divide by zero and yield inf; treat them as 0% change
    source_comparison['session_change_percent'] = source_comparison['session_change_percent'].replace([float('inf'), float('-inf')], 0).fillna(0)
    # Filter for sources with significant traffic in the comparison period and show the largest drops
    MIN_SESSIONS_THRESHOLD_SOURCE = 5  # Only consider sources with at least 5 sessions in the comparison period
    significant_source_drops = source_comparison[source_comparison['sessions_compare'] >= MIN_SESSIONS_THRESHOLD_SOURCE].sort_values('session_change_percent').head(10)
    if not significant_source_drops.empty:
        print(significant_source_drops[['traffic_source', 'traffic_medium', 'sessions_problem', 'sessions_compare', 'session_change_percent']])
    else:
        print("No significant traffic source/medium drops found with current threshold.")
# MAIN EXECUTION 1: This block runs only when the script is executed directly
if __name__ == "__main__":
    # Define your problem and comparison periods
    # Adjust these dates based on when you observed a traffic drop
    PROBLEM_START = datetime(2023, 10, 1).date()
    PROBLEM_END = datetime(2023, 10, 7).date()
    COMPARE_START = datetime(2023, 9, 24).date()
    COMPARE_END = datetime(2023, 9, 30).date()
    # MAIN EXECUTION 2: Connect to the database
    conn = connect_to_db()
    if conn:
        # MAIN EXECUTION 3: Diagnose the traffic drop
        diagnose_traffic_drop(conn, PROBLEM_START, PROBLEM_END, COMPARE_START, COMPARE_END)
        # MAIN EXECUTION 4: Close the database connection
        conn.close()
        print("Database connection closed.")
    else:
        print("Could not establish database connection. Exiting.")