# import libraries
# datetime management
import calendar
from datetime import datetime
# data management
import numpy as np
import scipy.stats as stats
from scipy.signal import find_peaks
import pandas as pd
from pandas.plotting import scatter_matrix
# graph plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.font_manager import FontProperties
from matplotlib import rcParams
import seaborn as sn
from IPython.display import display, HTML
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import lime
import lime.lime_tabular
# warnings (to hide some useless warnings).
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# plotly renderer settings.
import plotly.io as pio
pio.renderers.default = 'notebook'
# font settings
font_path = '/usr/share/fonts/truetype/wqy/wqy-microhei.ttc'
font_prop = FontProperties(fname=font_path)
config = {
"font.family":'monospace',
"font.monospace": [font_prop.get_name(), 'DejaVu Sans Mono'],
"font.size": 12,
"mathtext.fontset":'stix',
"axes.unicode_minus": False,
"xtick.direction":'in',
"ytick.direction":'in',
}
rcParams.update(config)
# define dataset
df = pd.read_csv('data.csv', delimiter = ',', header = 0)
# modify dataset
df["date"] = df.datetime.apply(lambda x : x.split()[0])
df["hour"] = df.datetime.apply(lambda x : x.split()[1].split(":")[0]).astype(int)
df["weekday"] = df.date.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,"%Y-%m-%d").weekday()])
df["month"] = df.date.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,"%Y-%m-%d").month])
df['month_num'] = df['month'].astype('category').cat.codes
df["year"] = df.date.apply(lambda dateString: int(datetime.strptime(dateString, "%Y-%m-%d").year))
df["day"] = df.date.apply(lambda dateString: int(datetime.strptime(dateString, "%Y-%m-%d").day))
# define variables
datetime = df['datetime']
time = df['datetime'].str.slice(11,13)
season = df['season']
holiday = df['holiday']
workingday = df['workingday']
weather = df['weather']
temp = df['temp']
atemp = df['atemp']
humidity = df['humidity']
windspeed = df['windspeed']
casual = df['casual']
registered = df['registered']
count = df['count']
common = df['count'] - df['registered']
features = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'year', 'month_num', 'day', 'hour']
target = 'count'
X = df[features]
Y = df[target]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
models = {
"RandomForest": RandomForestRegressor(random_state=42),
"GradientBoosting": GradientBoostingRegressor(random_state=42),
"AdaBoost": AdaBoostRegressor(random_state=42),
"LinearRegression": LinearRegression()
}
# read data.
df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | date | hour | weekday | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 | 2011-01-01 | 00 | Saturday | January |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 | 2011-01-01 | 01 | Saturday | January |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 | 2011-01-01 | 02 | Saturday | January |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 | 2011-01-01 | 03 | Saturday | January |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 | 2011-01-01 | 04 | Saturday | January |
# fetch info.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 10886 non-null object 1 season 10886 non-null int64 2 holiday 10886 non-null int64 3 workingday 10886 non-null int64 4 weather 10886 non-null int64 5 temp 10886 non-null float64 6 atemp 10886 non-null float64 7 humidity 10886 non-null int64 8 windspeed 10886 non-null float64 9 casual 10886 non-null int64 10 registered 10886 non-null int64 11 count 10886 non-null int64 12 date 10886 non-null object 13 hour 10886 non-null int64 14 weekday 10886 non-null object 15 month 10886 non-null object 16 month_num 10886 non-null int8 17 year 10886 non-null int64 18 day 10886 non-null int64 dtypes: float64(3), int64(11), int8(1), object(4) memory usage: 1.5+ MB
# organize general data.
general_reg_user = pd.concat([registered, common], axis=1)
general_reg_user = general_reg_user.sum()
# organize average workingday/holiday data.
workingday_user = df.groupby(['workingday']).sum()['count'].reset_index().drop(0)
holiday_user = df.groupby(['holiday']).sum()['count'].reset_index().drop(0)
avrg_data = {'Working Day': [workingday_user['count'].sum()/7412], 'Holiday': [holiday_user['count'].sum()/311]}
avrg_user = pd.DataFrame(avrg_data).sum()
# organize seasonal data
seasonal_counts = df.groupby(['season']).sum()['count'].reset_index()
# organize the DataFrame to show the count of different type of users differ from current time of the day.
user_active_time_cmp = pd.concat([time, count, registered, common], axis=1)
user_active_time_cmp_organized = user_active_time_cmp.groupby('datetime').agg({'count': 'sum', 'registered': 'sum', 0: 'sum'}).reset_index()
# by season.
user_active_season_cmp = pd.DataFrame(df.groupby(["hour","season"],sort=True)["count"].mean()).reset_index()
# by month.
sort_order = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
month_aggregated = pd.DataFrame(df.groupby("month")["count"].mean()).reset_index()
month_aggregated['month'] = pd.Categorical(month_aggregated['month'], categories=sort_order, ordered=True)
month_sorted = month_aggregated.sort_values(by='month')
# by week.
hue_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday", "Sunday"]
user_active_week_cmp = pd.DataFrame(df.groupby(["hour","weekday"],sort=True)["count"].mean()).reset_index()
user_active_week_cmp['weekday'] = pd.Categorical(user_active_week_cmp['weekday'], categories=hue_order, ordered=True)
hour_transformed = pd.melt(df[["hour", "casual", "registered"]],
id_vars=['hour'],
value_vars=['casual', 'registered'],
var_name='user_type',
value_name='user_count')
hour_aggregated = pd.DataFrame(hour_transformed.groupby(["hour", "user_type"], sort=True)["user_count"].mean()).reset_index()
hour_aggregated['total'] = hour_aggregated.groupby('hour')['user_count'].transform('sum')
这是一个散点矩阵图,用于同时可视化多个变量之间的关系。每个小格子代表两个变量之间的散点图,其中对角线上的小格子表示单个变量的密度图。这张图展示了四个变量:用户数量(count)、风速(windspeed)、温度(temp)和湿度(humidity)。
对角线上:
上三角形区域:
下三角形区域:
总结:用户数量受风速和温度影响较大,湿度对其影响较小。温度和湿度之间有一定相关性。
%config InlineBackend.figure_format = 'retina'
scatter_matrix(df[['count', 'windspeed', 'temp', 'humidity']], alpha=0.5, figsize=(10, 10), diagonal='kde')
plt.suptitle('变量的散点矩阵', fontsize = 24, fontproperties=font_prop)
plt.tight_layout()
plt.show()
这是一张热力图,显示了不同变量之间的相关性。每个单元格的颜色和数值表示两个变量之间的相关系数,范围从-1到1。以下是各列的详细分析:
总结:
%config InlineBackend.figure_format = 'retina'
corrMatt = df[["temp","atemp","casual","registered","humidity","windspeed","count"]].corr()
mask = np.triu(np.ones_like(corrMatt, dtype=bool))
htmp,ax= plt.subplots()
htmp.set_size_inches(20,10)
sn.heatmap(corrMatt, mask=mask,vmax=.8, square=True,annot=True)
<Axes: >
当我们查看饼图时,可以看到以下关键点:
这些分析提供了有关用户行为的基本概况。注册用户占比大,说明服务受欢迎程度较高,而季节性使用趋势可以帮助公司优化资源配置和推广策略。
# define subplots.
user_fig = make_subplots(rows=1, cols=3, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])
# general user distribution.
user_fig.add_trace(go.Pie(hoverinfo='label+percent', labels=['注册用户', '未注册用户'], values=general_reg_user, hole=.3, marker=dict(colors=['#73C5C5', '#A2D9D9'])), row=1, col=1)
# average user distribution based on holiday/workingday.
user_fig.add_trace(go.Pie(hoverinfo='label+percent', labels=['节假日', '工作日'], values=avrg_user, hole=.3, marker=dict(colors=['#F6D173', '#F9E0A2'])), row=1, col=2)
# general seasonal user distribution.
user_fig.add_trace(go.Pie(labels=['春天', '夏天', '秋天', '冬天'], values=seasonal_counts['count'], textinfo='label+percent', marker=dict(colors=['#FEFAE0', '#FAEDCE', '#E0E5B6', '#CCD5AE']), hole=.3, hoverinfo='label+percent'), row=1, col=3)
# minor tweaks.
user_fig.update_layout(title_text="用户分布", showlegend=True)
此三维散点图展示的是温度、湿度与用户数量之间的关系。图中的每个点都表示一个特定时间点的观测值,颜色深浅代表了用户数量的多少,即颜色越深,用户数量越多;反之,颜色越浅,用户数量越少。
从图中可以观察到以下几点:
温度与用户数量的关系:
湿度与用户数量的关系:
温度和湿度的交互效应:
峰值区域:
综上所述,共享单车的使用量受到气温和空气湿度的双重影响。当温度适宜并且湿度适中时,用户数量达到最大值。相反,过高的温度或湿度过大会导致用户数量减少。这些发现对于制定营销策略和优化服务时间表具有重要意义,例如在预测高峰时段时考虑气象因素,从而提高车辆分配效率和服务质量。
# define scatter map
scatter_fig_wea = go.Figure(data=[go.Scatter3d(x = temp, y = humidity, z = count, mode = 'markers', marker = dict(size = count/60, line = dict(width=0), color = count, colorscale = 'Viridis', opacity = 0.8), hovertemplate='<b>温度:</b> %{x}°C<br><b>湿度:</b> %{y}%<br><b>用户数量:</b> %{z}<extra></extra>')])
scatter_fig_wea.update_layout(
scene=dict(
xaxis=dict(
title='温度 (°C)',
range=[temp.min(), temp.max()],
autorange=False
),
yaxis=dict(
title='湿度 (%)',
range=[humidity.min(), humidity.max()],
autorange=False
),
zaxis=dict(
title='用户数量',
range=[count.min(), count.max()],
autorange=False
),
aspectmode='manual',
aspectratio=dict(x=1, y=1, z=1)
),
title='温度与湿度对用户数量的影响',
showlegend = False,
hovermode = 'closest',
height=800
)
这是一个垂直条形图,横坐标代表一年中的月份,纵坐标代表用户数量。每个条形的高度对应该月份的用户活跃数。
这是一个折线图,横坐标代表一周中的每一天,纵坐标代表用户数量。每种颜色的线代表一个特定的时间段内的用户活跃情况。
这也是一个折线图,横坐标代表四个季节:春、夏、秋、冬,纵坐标代表用户数量。每种颜色的线代表一个特定的时间段内的用户活跃情况。
这是一个折线图,横坐标代表一天中的小时,纵坐标代表用户数量。每种颜色的线代表一种用户类型,最后一条红线代表所有用户类型的总和。
综上所述,这些图表为我们提供了关于用户活跃度的重要见解,帮助我们理解用户的行为模式及其潜在影响因素。通过深入分析这些数据,我们可以更好地满足用户的需求,提高用户体验和服务质量。
# First Graph.
user_active_month = px.bar(month_sorted, x = 'month', y = 'count', title='基于月份分析的用户活跃度', color='count')
user_active_month.update_xaxes(
categoryorder='array',
categoryarray=sort_order,
ticktext=['一月', '二月', '三月', '四月', '五月', '六月', '七月', '八月', '九月', '十月', '十一月', '十二月'],
tickvals=sort_order
)
user_active_month.update_traces(
hovertemplate='<b>月份:</b> %{x}<br>' +
'<b>用户数量:</b> %{y:.2f}<br>' +
'<extra></extra>'
)
# Second Graph.
user_active_week = px.line(user_active_week_cmp, x = 'hour', y = 'count', color = 'weekday', title = '基于星期分析的用户活跃度', labels={'datetime': '时间', 'value': '数值'}, markers=True)
user_active_week.update_yaxes(categoryorder='array', categoryarray=hue_order)
legend_items = user_active_week.data
sorted_legend_items = sorted(legend_items, key=lambda item: hue_order.index(item.name))
user_active_week.data = []
for item in sorted_legend_items:
user_active_week.add_trace(item)
user_active_week.for_each_trace(lambda t: t.update(
name={
'Monday': '星期一',
'Tuesday': '星期二',
'Wednesday': '星期三',
'Thursday': '星期四',
'Friday': '星期五',
'Saturday': '星期六',
'Sunday': '星期日'
}.get(t.name, t.name),
hovertemplate="<b>时间:</b> %{x}<br>" + \
"<b>用户数量:</b> %{y:.2f}<br>" + \
"<extra></extra>"
))
# Third Graph.
user_active_season = px.line(user_active_season_cmp, x = 'hour', y = 'count', color = 'season', title = '基于季节分析的用户活跃度', labels={'datetime': '时间', 'value': '数值'}, markers=True)
user_active_season.for_each_trace(lambda t: t.update(
name={
'1': '春',
'2': '夏',
'3': '秋',
'4': '冬'
}.get(t.name, t.name),
hovertemplate="<b>时间:</b> %{x}<br>" + \
"<b>用户数量:</b> %{y:.2f}<br>" + \
"<extra></extra>"
))
# Fourth Graph.
user_active_time = go.Figure()
for user_type in ['casual', 'registered']:
user_active_time.add_trace(go.Scatter(
x=hour_aggregated[hour_aggregated['user_type'] == user_type]['hour'],
y=hour_aggregated[hour_aggregated['user_type'] == user_type]['user_count'],
mode='lines+markers',
name=user_type.capitalize(),
hovertemplate='<b>时间:</b> %{x}<br>' +
'<b>用户数量:</b> %{y:.2f}<br>' +
'<extra></extra>'
))
user_active_time.for_each_trace(lambda t: t.update(
name={
'Casual': '群众',
'Registered': '注册用户'
}.get(t.name, t.name),
))
user_active_time.add_trace(go.Scatter(
x=hour_aggregated['hour'],
y=hour_aggregated['total'],
mode='lines+markers',
name='总数',
line=dict(color='red', dash='dash'),
marker=dict(color='red', size=8),
hovertemplate='<b>时间:</b> %{x}<br>' +
'<b>总数:</b> %{y:.2f}<br>' +
'<extra></extra>'
))
fig = make_subplots(rows=2, cols=2,
subplot_titles=("基于月份分析的用户活跃度", "基于星期分析的用户活跃度",
"基于季节分析的用户活跃度", "用户类型按小时的平均数量及总数"))
for trace in user_active_month.data:
fig.add_trace(trace, row=1, col=1)
for trace in user_active_week.data:
fig.add_trace(trace, row=1, col=2)
for trace in user_active_season.data:
fig.add_trace(trace, row=2, col=1)
for trace in user_active_time.data:
fig.add_trace(trace, row=2, col=2)
fig.update_layout(height=800, title_text="用户活跃度分析", coloraxis_showscale=False, legend_title_text="用户类型")
fig.update_yaxes(title_text="用户数量")
fig.update_xaxes(
title_text="月份",
categoryorder='array',
categoryarray=sort_order,
ticktext=['一月', '二月', '三月', '四月', '五月', '六月', '七月', '八月', '九月', '十月', '十一月', '十二月'],
tickvals=sort_order,
row=1,
col=1
)
fig.update_xaxes(title_text="时间 (24小时制)", row=1, col=2)
fig.update_xaxes(title_text="时间 (24小时制)", row=2, col=1)
fig.update_xaxes(title_text="时间 (24小时制)", row=2, col=2)
fig.show()
从这四张图可以看出,数据的整体分布呈现出一定的规律性。具体来说,在不同的时间维度(季节、小时、是否是工作日)下,数据的中心位置和分散程度有所不同。例如,夏季和冬季的数据分布相对更宽泛;白天特别是下午时段的数据量较多;工作日和非工作日的数据分布没有显著差别。此外,还存在一些显著的异常值,特别是在总览图和按季节分布图中,这些异常值可能是由于某些特殊事件或测量误差导致的。
fig_2 = make_subplots(rows=2, cols=2,
subplot_titles=("总览", "按季节分布", "按小时分布", "按工作日分布"))
fig_2.add_trace(go.Box(y=df['count'], name='总览'), row=1, col=1)
fig_2.add_trace(go.Box(x=df['season'], y=df['count'], name='按季节分布'), row=1, col=2)
fig_2.add_trace(go.Box(x=df['hour'], y=df['count'], name='按小时分布'), row=2, col=1)
fig_2.add_trace(go.Box(x=df['workingday'], y=df['count'], name='按工作日分布'), row=2, col=2)
fig_2.update_layout(
height=1000,
showlegend=False
)
fig_2.update_xaxes(title_text="季节", row=1, col=2)
fig_2.update_xaxes(title_text="小时", row=2, col=1)
fig_2.update_xaxes(title_text="工作日", row=2, col=2)
fig_2.update_yaxes(title_text="用户数量", col=1)
fig_2.update_yaxes(title_text="用户数量", col=2)
fig_2.show()
该图表展示了RandomForest模型在训练集和测试集上的预测结果。从图中可以看到,模型在大部分时间点上的预测值(绿线)与实际值(蓝线)非常接近,表明该模型具有良好的拟合能力。然而,在某些时间段内,预测值与实际值之间存在较大的偏差,这可能是由于模型过拟合或者数据本身存在的噪声导致的。
此图表同样展示了GradientBoosting模型在训练集和测试集上的预测情况。可以看出,虽然整体趋势与实际情况相符,但在一些细节处仍然存在一定的差异。这种现象通常意味着模型可能存在欠拟合的问题,即其复杂度不足以捕捉到所有相关模式。
对于AdaBoost模型而言,其预测曲线显示出更明显的波动性和不稳定性。这意味着尽管该方法能够有效地提高弱分类器的整体性能,但它也可能引入额外的不确定性因素,从而降低最终输出的质量。
最后一个是线性回归模型的结果。显然,线性回归作为一种简单而强大的统计技术,在许多情况下都能提供可靠的结果。不过,正如预期那样,它无法完全捕捉非线性关系的存在,因此在某些区域内的预测准确性会受到限制。
%config InlineBackend.figure_format = 'retina'
lis=['a','b','c','d']
colors = {
"真实值": "blue",
"训练预测": "green",
"测试预测": "red"
}
results = []
lime_htmls = []
col = [tuple(np.random.rand(3)) for _ in range(3)]
fig, axs = plt.subplots(2, 2, figsize=(17, 12), dpi=200)
for i, (name, model) in enumerate(models.items()):
# train models.
model.fit(X_train, Y_train)
# prediction.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
# score.
mse_train = mean_squared_error(Y_train, y_pred_train)
mse_test = mean_squared_error(Y_test, y_pred_test)
r2_train = r2_score(Y_train, y_pred_train)
r2_test = r2_score(Y_test, y_pred_test)
results.append({
"模型": name,
"训练集的均方误差": mse_train,
"测试集的均方误差": mse_test,
"训练集的R²": r2_train,
"测试集的R²": r2_test
})
ax = axs[i // 2, i % 2]
# true values.
ax.plot(np.arange(len(X)), Y, label="真实值 (测试)", color=col[0], linewidth=2,linestyle="--")
# train prediction.
ax.plot(np.arange(len(Y_train)), y_pred_train, label=f"{name} 预测 (训练)", color=col[1], linestyle="--")
# test prediction.
ax.plot(np.arange(len(Y_train), len(Y_train) + len(Y_test)), y_pred_test, label=f"{name} 预测 (测试)", color=col[2], linestyle="--")
ax.set_title(f"({lis[i]}) {name} - 拟合与预测", fontsize=24, fontweight='semibold', fontproperties=font_prop)
ax.set_ylabel("用户数量", fontsize=18, fontweight='semibold', fontproperties=font_prop)
# tweaks.
ax.spines['left'].set_linewidth(2.5)
ax.spines['bottom'].set_linewidth(2.5)
ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(True)
ax.tick_params(axis='x', labelsize=12, width=2)
ax.tick_params(axis='y', labelsize=14, width=2.5)
# legend.
leg=ax.legend(loc='upper left',bbox_to_anchor=(0, 1), ncol=1, fontsize=12,
edgecolor='black', facecolor='white', framealpha=0)
for i, text in enumerate(leg.get_texts()):
text.set_color(col[i])
text.set_font_properties(font_prop)
# lime.
explainer = lime.lime_tabular.LimeTabularExplainer(
X_train.values, feature_names=X_train.columns, class_names=['用户数量'], verbose=True, mode='regression'
)
idx = 10
exp = explainer.explain_instance(X_test.iloc[idx], model.predict, num_features=5)
lime_html = exp.as_html()
lime_htmls.append(lime_html)
results_df = pd.DataFrame(results)
display(results_df.head())
plt.tight_layout()
plt.show()
for i, html_content in enumerate(lime_htmls):
display(HTML(f'<h2>({lis[i]}) {list(models.keys())[i]} - LIME解释</h2>'))
display(HTML(html_content))
Intercept 300.7742182546543 Prediction_local [-36.87329768] Right: 7.87 Intercept 294.525471206987 Prediction_local [-38.21660821] Right: 4.9147296338089435 Intercept 341.8157786240902 Prediction_local [11.56388283] Right: 32.13664596273292 Intercept -123.78461429089657 Prediction_local [-164919.82845724] Right: 15.386967180238571
| 模型 | 训练集的均方误差 | 测试集的均方误差 | 训练集的R² | 测试集的R² | |
|---|---|---|---|---|---|
| 0 | RandomForest | 268.033922 | 1791.554298 | 0.991819 | 0.945722 |
| 1 | GradientBoosting | 4334.617192 | 4625.437844 | 0.867690 | 0.859865 |
| 2 | AdaBoost | 11583.333863 | 11469.074163 | 0.646430 | 0.652525 |
| 3 | LinearRegression | 20034.288221 | 19903.013706 | 0.388474 | 0.397005 |