算法分析题:早高峰共享单车潮汐点的群智优化

任务一:为更好地掌握早高峰潮汐现象的变化规律与趋势,参赛者需基于主办方提供的数据进行数据分析和计算模型构建等工作,识别出工作日早高峰07:00-09:00潮汐现象最突出的40个区域,列出各区域所包含的共享单车停车点位编号名称,并提供计算方法说明及计算模型,为下一步优化措施提供辅助支撑。

In [20]:
import os, codecs
import pandas as pd
import numpy as np

%pylab inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')

from matplotlib import font_manager as fm, rcParams
import matplotlib.pyplot as plt
Populating the interactive namespace from numpy and matplotlib
In [21]:
!ls ../input/ -l
总用量 1867216
-rw-rw-r-- 1 lyz lyz     10108 1月  22 16:32 gdzdkltj_zjbh.csv
-rw-rw-r-- 1 lyz lyz      9132 1月  22 18:41 gdzdkltj_zjbh.rar
-rw-rw-r-- 1 lyz lyz    209867 1月  22 16:30 gdzdtjsj_czkl.csv
-rw-rw-r-- 1 lyz lyz    198943 1月  22 18:41 gdzdtjsj_czkl.rar
-rw-rw-r-- 1 lyz lyz    144190 1月  22 16:28 gdzdtjsj_jzkl.csv
-rw-rw-r-- 1 lyz lyz    139070 1月  22 18:41 gdzdtjsj_jzkl.rar
-rw-rw-r-- 1 lyz lyz  44673710 1月  21 13:29 gxdc_dd.csv
-rw-rw-r-- 1 lyz lyz   7803324 1月  22 13:45 gxdc_dd.rar
-rw-rw-r-- 1 lyz lyz 343512032 1月  20 09:54 gxdc_gj20201221.csv
-rw-rw-r-- 1 lyz lyz  46903519 2月   7 16:48 gxdc_gj20201221.rar
-rw-rw-r-- 1 lyz lyz 354393408 1月  20 09:54 gxdc_gj20201222.csv
-rw-rw-r-- 1 lyz lyz  48210968 2月   7 16:49 gxdc_gj20201222.rar
-rw-rw-r-- 1 lyz lyz 159761796 1月  20 09:55 gxdc_gj20201223.csv
-rw-rw-r-- 1 lyz lyz  22228271 2月   7 16:51 gxdc_gj20201223.rar
-rw-rw-r-- 1 lyz lyz 424187252 1月  20 09:55 gxdc_gj20201224.csv
-rw-rw-r-- 1 lyz lyz  56483808 2月   7 16:53 gxdc_gj20201224.rar
-rw-rw-r-- 1 lyz lyz 352361779 1月  20 09:55 gxdc_gj20201225.csv
-rw-rw-r-- 1 lyz lyz  47723946 2月   7 16:54 gxdc_gj20201225.rar
-rw-rw-r-- 1 lyz lyz   2397211 1月  21 13:23 gxdc_tcd.csv
-rw-rw-r-- 1 lyz lyz    616397 1月  22 13:45 gxdc_tcd.rar

读取数据集

In [22]:
# Directory holding the input CSV files. File names are appended with `+`
# below, so the trailing slash is required.
PATH = '../input/'

def bike_fence_format(s):
    """Parse a fence-location string into an array of coordinate pairs.

    Args:
        s: Text containing comma-separated numbers wrapped in any amount of
            square brackets, e.g. ``'[[x1,y1],[x2,y2],...]'``.

    Returns:
        A float ``numpy.ndarray`` of shape ``(n_points, 2)`` with one row per
        coordinate pair, in the order the values appear in ``s``.

    Raises:
        ValueError: If a token is not numeric or the number of values is odd.
    """
    tokens = s.replace('[', '').replace(']', '').split(',')
    # Two values per vertex; let NumPy infer the vertex count so that fences
    # with a vertex count other than five keep the (n_points, 2) layout.
    return np.array(tokens).astype(float).reshape(-1, 2)


# Shared-bike parking spots (electronic fences). FENCE_LOC arrives as text
# and is parsed into a coordinate array by bike_fence_format.
bike_fence = pd.read_csv(PATH + 'gxdc_tcd.csv').assign(
    FENCE_LOC=lambda frame: frame['FENCE_LOC'].apply(bike_fence_format)
)

# Shared-bike order records, ordered per bike and then by update time.
bike_order = pd.read_csv(PATH + 'gxdc_dd.csv').sort_values(
    by=['BICYCLE_ID', 'UPDATE_TIME']
)
In [23]:
import geohash

# Encode every order position as a precision-9 geohash string.
# NOTE(review): a later cell reassigns bike_order['geohash'] at precision 6,
# so this precision-9 result appears to be unused; confirm before keeping it.
bike_order['geohash'] = [
    geohash.encode(lat, lon, precision=9)
    for lat, lon in zip(bike_order['LATITUDE'], bike_order['LONGITUDE'])
]
In [24]:
from geopy.distance import geodesic

# Bounding box of each fence. Column 1 of FENCE_LOC is read as latitude and
# column 0 as longitude.
bike_fence['MIN_LATITUDE'] = bike_fence['FENCE_LOC'].apply(lambda pts: pts[:, 1].min())
bike_fence['MAX_LATITUDE'] = bike_fence['FENCE_LOC'].apply(lambda pts: pts[:, 1].max())

bike_fence['MIN_LONGITUDE'] = bike_fence['FENCE_LOC'].apply(lambda pts: pts[:, 0].min())
bike_fence['MAX_LONGITUDE'] = bike_fence['FENCE_LOC'].apply(lambda pts: pts[:, 0].max())

# NOTE: despite the column name, FENCE_AREA is a length in metres: the
# geodesic distance between the bounding box's (min lat, min lon) corner and
# its (max lat, max lon) corner.
bike_fence['FENCE_AREA'] = [
    geodesic((lat_lo, lon_lo), (lat_hi, lon_hi)).meters
    for lat_lo, lon_lo, lat_hi, lon_hi in zip(
        bike_fence['MIN_LATITUDE'], bike_fence['MIN_LONGITUDE'],
        bike_fence['MAX_LATITUDE'], bike_fence['MAX_LONGITUDE'],
    )
]

# Fence centre as (latitude, longitude): columns are reversed and the last
# vertex is left out of the mean (presumably it repeats the first vertex to
# close the polygon; confirm against the data).
bike_fence['FENCE_CENTER'] = bike_fence['FENCE_LOC'].apply(
    lambda pts: pts[:-1, ::-1].mean(axis=0)
)
In [25]:
import geohash

# Precision-6 geohash of every order position.
bike_order['geohash'] = [
    geohash.encode(lat, lon, precision=6)
    for lat, lon in zip(bike_order['LATITUDE'], bike_order['LONGITUDE'])
]

# Precision-6 geohash of every fence centre. FENCE_CENTER holds
# (latitude, longitude), matching the argument order used for the orders.
bike_fence['geohash'] = [
    geohash.encode(center[0], center[1], precision=6)
    for center in bike_fence['FENCE_CENTER']
]
In [26]:
# Sanity check: two points at the same latitude whose longitudes differ by
# 6e-5 degrees fall into the same precision-6 geohash cell.
tuple(
    geohash.encode(24.521156, lon, precision=6)
    for lon in (118.140385, 118.140325)
)
Out[26]:
('wsk52r', 'wsk52r')
In [27]:
# Parse timestamps, then derive string-typed calendar keys for pivoting.
bike_order['UPDATE_TIME'] = pd.to_datetime(bike_order['UPDATE_TIME'])

# Day of month as text (not zero-padded).
bike_order['DAY'] = bike_order['UPDATE_TIME'].dt.day.astype(object).map(str)

# Hour of day as two-character text, left-padded with '0'.
bike_order['HOUR'] = (
    bike_order['UPDATE_TIME'].dt.hour.astype(object).map(str)
    .str.rjust(2, fillchar='0')
)

# Combined key: day text immediately followed by the two-character hour.
bike_order['DAY_HOUR'] = bike_order['DAY'].str.cat(bike_order['HOUR'])

按照经纬度聚合

In [9]:
# Event counts per geohash cell (rows) and DAY_HOUR key (columns).
# Rows with LOCK_STATUS == 1 are counted as inflow and rows with
# LOCK_STATUS == 0 as outflow; combinations with no events become 0.
bike_inflow = bike_order.loc[bike_order['LOCK_STATUS'] == 1].pivot_table(
    index=['geohash'],
    columns=['DAY_HOUR'],
    values='LOCK_STATUS',
    aggfunc='count',
    fill_value=0,
)

bike_outflow = bike_order.loc[bike_order['LOCK_STATUS'] == 0].pivot_table(
    index=['geohash'],
    columns=['DAY_HOUR'],
    values='LOCK_STATUS',
    aggfunc='count',
    fill_value=0,
)
In [10]:
# Inflow and outflow curves of geohash cell 'wsk52r' over the DAY_HOUR keys,
# drawn on the same axes; tick labels come from the inflow table's columns.
for flow_table in (bike_inflow, bike_outflow):
    flow_table.loc['wsk52r'].plot()
plt.xticks(
    list(range(bike_inflow.shape[1])),
    bike_inflow.columns,
    rotation=40,
)
plt.legend(['Inflow', 'OutFlow'])
Out[10]:
<matplotlib.legend.Legend at 0x7f59779350f0>
2021-02-16T21:28:33.790396 image/svg+xml Matplotlib v3.3.3, https://matplotlib.org/
In [11]:
# Same comparison for geohash cell 'wsk596': inflow first, then outflow on
# the same axes, with the inflow table's column keys as tick labels.
for flow_table in (bike_inflow, bike_outflow):
    flow_table.loc['wsk596'].plot()
plt.xticks(
    list(range(bike_inflow.shape[1])),
    bike_inflow.columns,
    rotation=40,
)
plt.legend(['Inflow', 'OutFlow'])
Out[11]:
<matplotlib.legend.Legend at 0x7f59778baac8>
2021-02-16T21:28:33.931016 image/svg+xml Matplotlib v3.3.3, https://matplotlib.org/
In [12]:
# Daily event counts per geohash cell: rows are geohash cells, columns are
# DAY values. LOCK_STATUS == 1 rows form the inflow table, LOCK_STATUS == 0
# rows the outflow table; combinations with no events become 0.
bike_inflow = bike_order.loc[bike_order['LOCK_STATUS'] == 1].pivot_table(
    index=['geohash'],
    columns=['DAY'],
    values='LOCK_STATUS',
    aggfunc='count',
    fill_value=0,
)

bike_outflow = bike_order.loc[bike_order['LOCK_STATUS'] == 0].pivot_table(
    index=['geohash'],
    columns=['DAY'],
    values='LOCK_STATUS',
    aggfunc='count',
    fill_value=0,
)
In [13]:
# Net accumulation per geohash cell and day: inflow minus outflow.
# fill_value=0 treats a cell/day that is absent from one of the two tables as
# zero events. A plain `-` yields NaN there, and the following fillna(0)
# would then erase cells that only ever receive bikes.
bike_remain = bike_inflow.sub(bike_outflow, fill_value=0).fillna(0)
# Only net surpluses count; days with a net deficit contribute nothing.
bike_remain = bike_remain.clip(lower=0)
# Total surplus per cell across all days.
bike_remain = bike_remain.sum(axis=1)
# Attach the per-cell total to each fence through its geohash; fences whose
# cell has no entry get 0.
bike_fence['DENSITY'] = bike_fence['geohash'].map(bike_remain).fillna(0)

按照最近邻停车点聚合

思路:为每条订单找到距离最近的停车点(电子围栏中心),再按停车点统计流入量与流出量,据此识别潮汐点。

In [28]:
import hnswlib
import numpy as np

# Nearest-neighbour index over the fence centres: 2-D points compared in
# hnswlib's 'l2' space.
p = hnswlib.Index(dim=2, space='l2')
p.init_index(M=32, ef_construction=1000, max_elements=300000)
p.set_num_threads(14)
p.set_ef(1024)

# Stack the per-fence centre arrays into one (n_fences, 2) matrix. No ids are
# passed to add_items.
fence_centers = np.stack(bike_fence['FENCE_CENTER'].tolist())
p.add_items(fence_centers)
In [29]:
# Query the single nearest fence centre (k=1) for every order position.
index, dist = p.knn_query(bike_order[['LATITUDE', 'LONGITUDE']].values, k=1)
# Flatten the returned labels and use them as row positions into bike_fence
# to fetch the matching FENCE_ID for each order.
bike_order['fence'] = bike_fence['FENCE_ID'].iloc[index.ravel()].values
In [30]:
# Daily event counts per assigned fence: rows are fence ids, columns are DAY
# values. LOCK_STATUS == 1 rows form the inflow table, LOCK_STATUS == 0 rows
# the outflow table; combinations with no events become 0.
bike_inflow = bike_order.loc[bike_order['LOCK_STATUS'] == 1].pivot_table(
    index=['fence'],
    columns=['DAY'],
    values='LOCK_STATUS',
    aggfunc='count',
    fill_value=0,
)

bike_outflow = bike_order.loc[bike_order['LOCK_STATUS'] == 0].pivot_table(
    index=['fence'],
    columns=['DAY'],
    values='LOCK_STATUS',
    aggfunc='count',
    fill_value=0,
)

# Net accumulation per fence and day: inflow minus outflow.
# fill_value=0 treats a fence/day that is absent from one of the two tables
# as zero events. A plain `-` yields NaN there, and the following fillna(0)
# would then erase fences that only ever receive bikes.
bike_remain = bike_inflow.sub(bike_outflow, fill_value=0).fillna(0)
# Only net surpluses count; days with a net deficit contribute nothing.
bike_remain = bike_remain.clip(lower=0)
# Total surplus per fence across all days.
bike_remain = bike_remain.sum(axis=1)
In [31]:
# Surplus per fence divided by that fence's FENCE_AREA value, aligned on
# FENCE_ID. Rows are ordered from highest to lowest ratio, the fence id is
# moved from the index into a column, and remaining NaN entries are set to 0.
bike_density = (
    bike_remain.div(bike_fence.set_index('FENCE_ID')['FENCE_AREA'])
    .sort_values(ascending=False)
    .reset_index()
    .fillna(0)
)
In [32]:
# Flag column: '0' everywhere, then '1' for the first 100 rows of the table.
bike_density['label'] = '0'
bike_density.loc[bike_density.index[:100], 'label'] = '1'

# Add the constant area name and discard the column labelled 0.
bike_density = bike_density.assign(BELONG_AREA='厦门').drop(columns=0)
In [33]:
# Rename the three remaining columns and write the table as a '|'-separated
# text file without the row index.
bike_density.columns = [
    'FENCE_ID',
    'FENCE_TYPE',
    'BELONG_AREA',
]
bike_density.to_csv('result.txt', sep='|', index=False)
In [ ]: