#Start by importing neccessary libraries to python
import pandas as pd
import os


#specify the path for input file
input_filepath = r'C:\\Users\\kna99391\\QUADRO\\'
#specify the name of input file
input_filename = "FACT_EVENT_DIW_Events_Detail_dropped_duplicates_v7.1.csv"


#As the dataset is large (>5 millions entries) it is better to define the datatypes of the variables in advance.
#This opmitization speeds up loading process
dtypes = {'EventID': 'Int64', 'EventNumber': 'Int64', 'EventVersion':'Int64',
          'EventDescription': str,"HE_Ref": str,
          "Works_Ref": str, 'TMA_Works_Ref': str, 'PIN': str,
          'EventType': str, 'NatureOfWorks':str ,'Impact_Group_ID': 'Int64',
          'ImpactGroupName': str, 'TMDescription': str,
          'SU Works': 'Int64',
          'DiversionInd': 'Int64', 'NarrowLanesInd': 'Int64', 'FullClosuredInd': 'Int64',
          'LaneClosureInd': 'Int64', 'Speed_Limit': 'Int64', 'Road': str, 'ClosureStartDate': str, 
          'ClosureEndDate': str, 'ActualStartInd': 'Int64','ActualEndInd': 'Int64',
          'Element_Name': str, 'NTIS_LINK_ID': 'Int64', 'StartChainage': 'Float64',
          'EndChainage': 'Float64','Length': 'Float64',
          'ClosureStartDay': str, 'ClosureEndDay': str,'ClosureStartDayIsWeekend': str,
          'ClosureEndDayIsWeekend': str,
          'ClosureStartDayIsBH' : str,'ClosureEndDayIsBH' : str,}


df = pd.read_csv(input_filepath+input_filename,dtype = dtypes)


df['ClosureStartDate']=pd.to_datetime(df['ClosureStartDate'], format='%Y-%m-%d')


df = df.sort_values(by=['ClosureStartDate'],axis=0, ascending=True)


ClosureStartDateVals= []
#ClosureStartDateVals.clear()
ClosureStartDateVals=df['ClosureStartDate'].apply(lambda x: x.toordinal()).values.tolist()


ClosureStartDayVals= []
#ClosureStartDayVals.clear()
ClosureStartDayVals=df['ClosureStartDay'].values.tolist()


StartInterval=[]
StartInterval.clear()


for i in range(len(ClosureStartDateVals)):
    #check if the date is not same as on the previous row:
    if not (ClosureStartDateVals[i]-ClosureStartDateVals[i-1]==0):
        #check if the date is not 'tomorrow' when compared against the previous row:
        if not(ClosureStartDateVals[i]-ClosureStartDateVals[i-1] ==1):
            #check if the works are not continuing after weekend:
            if not(ClosureStartDayVals[i]== 'Friday' and ClosureStartDayVals[i+1]== 'Monday'):
                #the above condtion is not sufficient, we need to make sure the dates 
                #are not going over consecutive 2 weeks:
                if not (ClosureStartDateVals[i]-ClosureStartDateVals[i-1] ==3):
                    print('Consecutive interval starts at row: ' +\
                          str(ClosureStartDateVals.index(ClosureStartDateVals[i])+2))
                    StartInterval.append(ClosureStartDateVals.index(ClosureStartDateVals[i]))


IntervalGroups = []
IntervalGroups.clear()


for i in range(len(StartInterval)):
    IntervalGroups.append('Interval_{}'.format(i+1))


Intervals = dict(zip(StartInterval, IntervalGroups))


Intervals


df["ConsecutiveInterval"]=df.index.map(Intervals).values
df["ConsecutiveInterval"]=df["ConsecutiveInterval"].fillna(method="ffill")


df["ConsecutiveInterval"]

0       Interval_1
1       Interval_1
2       Interval_1
3       Interval_1
4       Interval_1
           ...    
2335    Interval_5
2336    Interval_5
2337    Interval_5
2338    Interval_5
2339    Interval_5
Name: ConsecutiveInterval, Length: 2340, dtype: object

Group Events by series of consecutive dates¶

Create a list of start dates in ordinal format¶

Create a list of start days¶

Find on which rows consecutive time intervals starts¶

Create dictionary of row indexes and indicators of the groups intervals starts¶

Create column for interval indicator for each entry of the dataset¶