For the project QUADRO I was working on during my time in Motts, I have encountered a challenge where I needed to figure out the way to specifically sort traffic management events from National Highways database by grouping specific events into series of consecutive dates, taking into consideration if the works continue next week after the weekend. I have developed a simple python utility to aid me in this task.
#Start by importing neccessary libraries to python
import pandas as pd
import os
#specify the path for input file
input_filepath = r'C:\\Users\\kna99391\\QUADRO\\'
#specify the name of input file
input_filename = "FACT_EVENT_DIW_Events_Detail_dropped_duplicates_v7.1.csv"
#As the dataset is large (>5 millions entries) it is better to define the datatypes of the variables in advance.
#This opmitization speeds up loading process
dtypes = {'EventID': 'Int64', 'EventNumber': 'Int64', 'EventVersion':'Int64',
'EventDescription': str,"HE_Ref": str,
"Works_Ref": str, 'TMA_Works_Ref': str, 'PIN': str,
'EventType': str, 'NatureOfWorks':str ,'Impact_Group_ID': 'Int64',
'ImpactGroupName': str, 'TMDescription': str,
'SU Works': 'Int64',
'DiversionInd': 'Int64', 'NarrowLanesInd': 'Int64', 'FullClosuredInd': 'Int64',
'LaneClosureInd': 'Int64', 'Speed_Limit': 'Int64', 'Road': str, 'ClosureStartDate': str,
'ClosureEndDate': str, 'ActualStartInd': 'Int64','ActualEndInd': 'Int64',
'Element_Name': str, 'NTIS_LINK_ID': 'Int64', 'StartChainage': 'Float64',
'EndChainage': 'Float64','Length': 'Float64',
'ClosureStartDay': str, 'ClosureEndDay': str,'ClosureStartDayIsWeekend': str,
'ClosureEndDayIsWeekend': str,
'ClosureStartDayIsBH' : str,'ClosureEndDayIsBH' : str,}
df = pd.read_csv(input_filepath+input_filename,dtype = dtypes)
df['ClosureStartDate']=pd.to_datetime(df['ClosureStartDate'], format='%Y-%m-%d')
df = df.sort_values(by=['ClosureStartDate'],axis=0, ascending=True)
Ordinal format is suitable to calculate difference between dates
ClosureStartDateVals= []
#ClosureStartDateVals.clear()
ClosureStartDateVals=df['ClosureStartDate'].apply(lambda x: x.toordinal()).values.tolist()
ClosureStartDayVals= []
#ClosureStartDayVals.clear()
ClosureStartDayVals=df['ClosureStartDay'].values.tolist()
StartInterval=[]
StartInterval.clear()
for i in range(len(ClosureStartDateVals)):
#check if the date is not same as on the previous row:
if not (ClosureStartDateVals[i]-ClosureStartDateVals[i-1]==0):
#check if the date is not 'tomorrow' when compared against the previous row:
if not(ClosureStartDateVals[i]-ClosureStartDateVals[i-1] ==1):
#check if the works are not continuing after weekend:
if not(ClosureStartDayVals[i]== 'Friday' and ClosureStartDayVals[i+1]== 'Monday'):
#the above condtion is not sufficient, we need to make sure the dates
#are not going over consecutive 2 weeks:
if not (ClosureStartDateVals[i]-ClosureStartDateVals[i-1] ==3):
print('Consecutive interval starts at row: ' +\
str(ClosureStartDateVals.index(ClosureStartDateVals[i])+2))
StartInterval.append(ClosureStartDateVals.index(ClosureStartDateVals[i]))
IntervalGroups = []
IntervalGroups.clear()
for i in range(len(StartInterval)):
IntervalGroups.append('Interval_{}'.format(i+1))
Intervals = dict(zip(StartInterval, IntervalGroups))
Intervals
df["ConsecutiveInterval"]=df.index.map(Intervals).values
df["ConsecutiveInterval"]=df["ConsecutiveInterval"].fillna(method="ffill")
df["ConsecutiveInterval"]
0 Interval_1
1 Interval_1
2 Interval_1
3 Interval_1
4 Interval_1
...
2335 Interval_5
2336 Interval_5
2337 Interval_5
2338 Interval_5
2339 Interval_5
Name: ConsecutiveInterval, Length: 2340, dtype: object