04 Time Series with Regressors

# load fbprophet library (plus the other packages this notebook needs)
# each entry is (package name, keyword arguments handed to installPyPI):
#   FBProphet - find latest version of fbprophet here: https://pypi.org/project/fbprophet/
#   holidays  - this entry is in response to this issue with fbprophet 0.5: https://github.com/facebook/prophet/issues/1293
#   mlflow    - installed without requesting a specific version
required_libraries = (
  ('FBProphet', {'version': '0.5'}),
  ('holidays', {'version': '0.9.12'}),
  ('mlflow', {}),
  )

# install in the order listed above
for library_name, install_options in required_libraries:
  dbutils.library.installPyPI(library_name, **install_options)

# restart Python once all installs have been requested
dbutils.library.restartPython()

import mlflow
import mlflow.sklearn
import shutil

from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf, PandasUDFType

import pandas as pd

import logging
logging.getLogger('py4j').setLevel(logging.ERROR)

from fbprophet import Prophet

# structure of the dataset returned by the function:
# the station id and timestamp first, followed by float columns for the
# actual value (y) and each forecast column with its _lower/_upper companions
result_schema = StructType(
  [
    StructField('station_id', IntegerType()),
    StructField('ds', TimestampType()),
    ]
  + [
    StructField(float_column, FloatType())
    for float_column in (
      'y',
      'yhat', 'yhat_lower', 'yhat_upper',
      'trend', 'trend_lower', 'trend_upper',
      'multiplicative_terms', 'multiplicative_terms_lower', 'multiplicative_terms_upper',
      'daily', 'daily_lower', 'daily_upper',
      'weekly', 'weekly_lower', 'weekly_upper',
      'yearly', 'yearly_lower', 'yearly_upper',
      'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
      'holidays', 'holidays_lower', 'holidays_upper',
      )
    ]
  )

# forecast function
@pandas_udf( result_schema, PandasUDFType.GROUPED_MAP )
def get_forecast(keys, group_pd):
  """Fit a Prophet model on one group of rows and return history plus forecast.

  keys:     grouping key; keys[0] is used as the station id and keys[1] as
            the number of hourly periods to forecast.
  group_pd: pandas DataFrame for the group; the columns read here are
            'ds', 'y', 'temp_f', 'precip_in' and 'is_historical'.

  Returns a pandas DataFrame with the columns listed in output_columns below
  (the forecast columns, the historical 'y' where one exists for the same
  'ds', and the station id).

  Side effect: the fitted model is written under
  /dbfs/mnt/citibike/timeseries_regressors/<station id>, after removing
  whatever already exists at that path.
  """

  # DATA PREP
  # ---------------------------------
  # the grouping key carries the station id and the forecast horizon
  station_id, hours_to_forecast = keys[0], keys[1]

  # training data: rows flagged as historical, minus any row with a missing value
  is_history = group_pd['is_historical'] == 1
  history_pd = group_pd[is_history].dropna()
  # ---------------------------------

  # TRAIN MODEL
  # ---------------------------------
  # configure the model, taking the holidays from the broadcast variable
  model = Prophet(
    interval_width=0.80,
    growth='linear',
    daily_seasonality=True,
    weekly_seasonality=True,
    yearly_seasonality=True,
    holidays=holidays_broadcast.value
    )

  # register both weather columns as multiplicative regressors
  for regressor_name in ('temp_f', 'precip_in'):
    model.add_regressor(regressor_name, mode='multiplicative')

  model.fit(history_pd)

  # save the model for potential later use, replacing any earlier copy
  model_path = '/dbfs/mnt/citibike/timeseries_regressors/{0}'.format(station_id)
  shutil.rmtree(model_path, ignore_errors=True)
  mlflow.sklearn.save_model(model, model_path)
  # ---------------------------------

  # FORECAST
  # ---------------------------------
  # timestamps to predict, left-joined to the regressor values
  # that the group provides for each timestamp
  future_pd = pd.merge(
    model.make_future_dataframe(periods=hours_to_forecast, freq='H'),
    group_pd[['ds', 'temp_f', 'precip_in']],
    how='left',
    on='ds',
    sort=True,
    suffixes=('_l','_r')
    )

  forecast_pd = model.predict(future_pd)
  # ---------------------------------

  # PREPARE RESULTS
  # ---------------------------------
  # attach the historical actuals; rows without history keep a missing y
  results_pd = pd.merge(
    forecast_pd,
    history_pd[['ds','y']],
    how='left',
    on='ds',
    sort=True,
    suffixes=('_l','_r')
    )

  # assign station to results
  results_pd['station_id'] = station_id
  # ---------------------------------

  # columns handed back, in the same order as result_schema
  output_columns = [
    'station_id', 'ds',
    'y', 'yhat', 'yhat_lower', 'yhat_upper',
    'trend', 'trend_lower', 'trend_upper',
    'multiplicative_terms', 'multiplicative_terms_lower', 'multiplicative_terms_upper',
    'daily', 'daily_lower', 'daily_upper',
    'weekly', 'weekly_lower', 'weekly_upper',
    'yearly', 'yearly_lower', 'yearly_upper',
    'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
    'holidays', 'holidays_lower', 'holidays_upper',
    ]
  return results_pd[output_columns]

ERROR:fbprophet:Importing plotly failed. Interactive plots will not work. /databricks/spark/python/pyspark/sql/types.py:1624: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working arrow_type = pa.struct(fields)

# identify hours that should be treated as aligned with holidays:
# every hour in citibike.periods whose date matches a row in citibike.holidays
holidays_query = '''
    SELECT
      b.hour as ds,
      a.holiday as holiday
    FROM citibike.holidays a
    INNER JOIN citibike.periods b
      ON a.date=to_date(b.hour)
    '''
holidays_pd = spark.sql(holidays_query).toPandas()

# replicate a copy of the holidays dataset to each node
# (get_forecast reads it back through holidays_broadcast.value)
holidays_broadcast = sc.broadcast(holidays_pd)

from pyspark.sql.functions import lit

# define number of hours to forecast
hours_to_forecast = 36

# assemble historical dataset for training; the forecast horizon is
# substituted into the INTERVAL clause so the periods extend past end_date
inputs_query = '''
  SELECT
    a.station_id,
    a.hour as ds, 
    COALESCE(b.rentals,0) as y,
    c.avg_temp_f as temp_f,
    COALESCE(c.precip_in,0) as precip_in,
    a.is_historical  
  FROM ( -- all rental hours by currently active stations
    SELECT
      y.station_id,
      x.hour,
      CASE WHEN x.hour <= y.end_date THEN 1 ELSE 0 END as is_historical
    FROM citibike.periods x
    INNER JOIN citibike.stations_most_active y
     ON x.hour BETWEEN y.start_date AND (y.end_date + INTERVAL {0} HOURS)
    ) a
  LEFT OUTER JOIN citibike.rentals b
    ON a.station_id=b.station_id AND a.hour=b.hour
  LEFT OUTER JOIN citibike.weather c
    ON a.hour=c.hour
  '''.format(hours_to_forecast)
inputs = spark.sql(inputs_query)

# generate forecast: one get_forecast call per station, with the horizon
# carried along as a second (constant) grouping key
grouped_inputs = inputs.groupBy('station_id', lit(hours_to_forecast))
forecast = grouped_inputs.apply(get_forecast)

# expose the result to SQL under a temporary view name
forecast.createOrReplaceTempView('forecast_timeseries_with_regressors')

%sql
-- Rebuild the persisted forecast table from the temp view of the same name:
-- drop any existing table first, then recreate it from the view's rows.
DROP TABLE IF EXISTS citibike.forecast_timeseries_with_regressors;

-- the unqualified name in the FROM clause is the temporary view registered
-- by the Python cell that generates the forecast
CREATE TABLE citibike.forecast_timeseries_with_regressors 
USING DELTA
AS
  SELECT *
  FROM forecast_timeseries_with_regressors;

# release the broadcast holidays dataset now that the forecast table has been written
# NOTE(review): blocking=True presumably waits for the unpersist to complete - confirm against the Spark Broadcast docs
holidays_broadcast.unpersist(blocking=True)

# extract the forecast for station 518 from our persisted dataset
forecast_table = spark.table('citibike.forecast_timeseries_with_regressors')
forecast_pd = forecast_table.filter('station_id=518').toPandas()

# retrieve the model saved for this station
model = mlflow.sklearn.load_model('/dbfs/mnt/citibike/timeseries_regressors/518')

# chart the model's components for this forecast
trends_fig = model.plot_components(forecast_pd)
display(trends_fig)

from datetime import datetime

# construct a visualization of the forecast
predict_fig = model.plot(forecast_pd, xlabel='hour', ylabel='rentals')
forecast_axes = predict_fig.axes[0]

# record the chart's current x-axis limits
xlim = forecast_axes.get_xlim()

# adjust the x-axis to focus on a limited date range
window_start = datetime.strptime('2020-01-15','%Y-%m-%d')
window_end = datetime.strptime('2020-02-03','%Y-%m-%d')
new_xlim = (window_start, window_end)
forecast_axes.set_xlim(new_xlim)

# display the chart
display(predict_fig)

%sql -- per station
-- Forecast error metrics for each station, computed over the forecast rows
-- whose ds is on or before the station's end_date in citibike.stations.
SELECT
  e.station_id,
  e.error_sum/n as ME,                    -- mean signed error (yhat - y); can be negative, so it is a bias measure rather than an absolute error
  e.error_sum_abs/n as MAD,               -- mean of the absolute errors
  e.error_sum_sqr/n as MSE,               -- mean of the squared errors
  POWER(e.error_sum_sqr/n, 0.5) as RMSE,  -- square root of MSE
  e.error_sum_abs_prop_y/n as MAPE        -- mean absolute error as a proportion of y_corrected
FROM (
  SELECT -- error base values 
    x.station_id,
    COUNT(*) as n,
    SUM(x.yhat-x.y) as error_sum,
    SUM(ABS(x.yhat-x.y)) as error_sum_abs,
    SUM(POWER((x.yhat-x.y),2)) as error_sum_sqr,
    SUM(ABS((x.yhat-x.y)/x.y_corrected)) as error_sum_abs_prop_y,
    SUM(ABS((x.yhat-x.y)/x.yhat)) as error_sum_abs_prop_yhat,
    SUM(x.y) as sum_y,
    SUM(x.yhat) as sum_yhat
  FROM ( -- actuals vs. forecast
    SELECT
      a.station_id,
      a.ds as ds,
      CAST(COALESCE(a.y,0) as float) as y,            -- missing actuals count as 0 in the error terms
      -- NOTE(review): COALESCE only substitutes 1 when y is NULL; a y of exactly 0
      -- still reaches the MAPE division as a zero denominator - confirm the intended handling
      CAST(COALESCE(a.y,1) as float) as y_corrected,
      a.yhat
    FROM citibike.forecast_timeseries_with_regressors a
    INNER JOIN citibike.stations b
      ON a.station_id = b.station_id AND
         a.ds <= b.end_date
     ) x
   GROUP BY x.station_id
  ) e
ORDER BY e.station_id


72	-0.002531347557845636	2.613626455975801	14.405491739440532	3.7954567234313883	0.6361909529936978
116	-0.030240065631353005	3.2164035024042557	19.359833954901973	4.399981131198403	0.537796958897676
127	-0.002383675229005273	3.853379084009329	28.82954123885075	5.369314782991472	0.7604846070279573
128	-0.011158153195404925	3.6777723429165974	25.577200572465003	5.057390688138005	0.7604348703915806
151	-0.009683172338863408	4.128176594567246	34.41638128355023	5.866547646064952	0.7952986405155138
161	-0.043645774704187526	2.8340610195977223	14.06138319977548	3.7498510903468527	0.6145931151625206
167	-0.002432018704911906	3.1012709269723846	19.414807667602428	4.406223742344734	0.65850682183287
168	-0.01783587040291986	4.164241995214933	35.47327713748607	5.955944688920984	0.7105272388032213
173	-0.00232716978678227	3.7341522317790594	27.706920945907278	5.263736405435523	0.7618224926869134
174	-0.0016526896784045575	2.586865948110474	15.04513836758969	3.878806307047271	0.6638078041957765
195	-0.0009118562728968825	2.835154540758998	17.10438731060338	4.135745073212731	0.6367696730129683
212	-0.04136259388451938	2.947314336555975	16.687992465772048	4.085093935978957	0.624992056413543
223	-0.009074134857129644	2.4958417448024823	11.559567784453488	3.399936438296088	0.5083728779916487
229	-0.02142155212948912	3.736995323581605	25.424066801793444	5.042228356767813	0.7126829412845748
236	-0.00013786211991615778	3.094429105743328	19.095967336296006	4.3698932866027755	0.7064860344542223
237	0.001410115577103157	3.079502996932018	18.00304346205743	4.242999347402428	0.6973308372419224
251	-0.005498533421623334	3.0147436391959355	17.040276012207126	4.127986920062505	0.6691440931688556
252	-0.007090359705222498	2.2123456145397085	9.032560417689563	3.0054218368957066	0.5624905994743866
257	-0.011718934240737446	2.3709298231861102	12.138010344585252	3.483964745026168	0.5828373735949307
265	-0.006870768150650735	2.5179778773120627	12.348205879555278	3.5140014057417903	0.6253770797913696
268	-0.0072722862000461005	2.4699250710744063	12.586994283077676	3.547815424043037	0.581789255955278
280	-0.00590813187342668	2.0331897574274755	7.483631172778253	2.735622629819079	0.5073707853279796
281	-0.053367689982488205	5.297326954490056	66.97208724211572	8.183647551191077	1.0063533762818244
284	-0.01931991759422874	3.9919223731056657	28.026938877117697	5.294047494792401	0.6641442543893434
285	-0.013972186747151002	4.428825811526489	36.357121627564226	6.029686693980395	0.7655146144444271
293	-0.02802105394286484	4.856656726988936	50.3295081903867	7.094329298135709	0.7936256489879947
297	-0.006626306199701921	2.962332210783801	15.875563061061326	3.984415021187091	0.6522624770830419
301	-0.001866897976934494	2.9553210467970787	21.129561289011228	4.596690253759897	0.7476807651917888
303	-0.02082129390447639	2.4123650907566745	12.937797385351205	3.596914981668486	0.5478883318408649
304	-0.025833277263347165	3.3502821960785027	30.542118618846292	5.526492433618841	0.6906712065738211
305	-0.0031226005778328117	2.7098530912126546	14.68703931733216	3.8323673254702713	0.6033485470106906
307	-0.012032451328371163	2.7756632297626287	15.2473278993824	3.9047826955392027	0.6784578435684812
312	-0.004436996814349278	2.9441735885570814	16.46435059433563	4.057628691038108	0.7302458229517977
315	-0.010865474840229126	3.2736790390109225	22.785534671714803	4.773419599376824	0.6275117704847409
317	-0.0019685411543564225	2.6715105927655243	15.402053289958118	3.924544978715127	0.6787382587328026
319	-0.029905906808747353	2.5665184070205647	14.224318921537552	3.7715141417655524	0.5153515085839753
325	-0.0008486547020563651	2.3501818005442634	10.620744940783597	3.258948440952019	0.6131091860456089
326	-0.002294261983188061	2.7549494028301846	14.308266535445583	3.782626935800778	0.6636380376051059
327	-0.008960753356627249	4.658858728674081	43.8018930047023	6.618299857569337	0.848151561331495
328	-0.005914366451596255	2.3418288953647024	10.869739043987376	3.2969287289820772	0.5638366035893823
334	-0.005076296451548385	2.901405687411922	16.135945960950014	4.016957301360074	0.6376599581918272
335	-0.0019693278365379304	2.4345832210035097	10.956818439466623	3.3101085238201216	0.5692528912866123
336	-0.007384266757869443	2.6019175773220766	12.160843450566428	3.4872400907546397	0.6264264740823718
346	-0.0027514353376695048	2.347011121711023	10.317704870240107	3.2121184396345206	0.5888388294690834
347	-0.035358683150688484	3.629292736037329	27.83699759742392	5.276077861198025	0.7042469495839689
355	-0.003976747591214596	2.0612402891399513	7.8556674792934835	2.8027963677894054	0.5325486237169635
358	-0.011092033560011408	4.824169315690827	50.29153051691272	7.091652171173705	0.8988950185010973
359	-0.016747502655301325	6.792028880520577	146.32072757400192	12.096310494278903	1.2555987067696455
361	-0.002314137324846112	2.585864787758208	13.031826281092407	3.6099620885948935	0.6463443132518922
363	-0.012821678032396559	2.9069903494206626	17.007521712429224	4.124017666357556	0.638044511687129
368	-0.00539460679837331	4.0616541452215165	29.912872481532006	5.469266173951676	0.7653904547617517
377	-0.024784656433050365	3.118913318278197	27.819120918202053	5.274383463325552	0.7314091163511408
379	-0.008385939036210552	4.908706417477347	47.796678739095995	6.91351421052246	1.009219462576605
380	-0.0044831286139292836	3.1792049750736897	18.368754278595656	4.285878472214962	0.6965561045509492
383	-0.0047119465588462296	2.626206018115895	12.69827836408664	3.563464376710765	0.6053348376128355
387	-0.03334578009406939	4.18594741799845	37.981012130181945	6.162873690915784	0.839384774802301
388	-0.011233980885833874	3.2795664098048403	24.268869072942557	4.926344392441779	0.7144313022780764
394	0.0012422334861980383	2.5341581601239755	14.394975814834368	3.7940711399279756	0.657976817741848
401	-0.0012197207314779566	2.852207764189575	15.467306793521146	3.9328497039069705	0.6705478563958467
402	-0.017365802262732346	6.442482831087864	120.58367903150203	10.981060014019686	1.0142440741572567
403	-0.011704246507891976	2.433055616281971	11.631557892498474	3.4105069846722897	0.5299114666114819
405	-0.05930449402966646	3.497186688627303	22.90163333679363	4.785565101092412	0.7235631800623961
410	-0.0005716500404116396	2.7362063977896853	15.06050194383791	3.8807862533045943	0.6799055417634492
412	-0.010056964412831822	2.27901799410694	9.850495723604459	3.1385499396384406	0.5781037631629878
417	-0.00967307496232281	3.747702671893599	28.36503142393055	5.325883159057336	0.7897328947653961
426	-0.055755589916519556	6.561911115711492	90.20200168268472	9.497473436798058	1.0683238168638738
432	-0.002947030369156658	3.8706602425774683	35.346577046631296	5.945298734851874	0.8545895919522826
434	-0.021262961033778838	2.6791695751337867	14.464675057203772	3.803245332239793	0.509007950534333
435	-0.006187155251073998	5.0122682430868375	47.85600437481596	6.917803435687946	0.7803511072477368
438	-0.010144655149176583	2.962832592466057	20.700171325449787	4.549744094501337	0.7200854367162124
439	-0.026473302406487792	2.78063716680232	15.000834380805626	3.873091062808313	0.6136420323503864
442	-0.006213577626320721	3.8528974948683596	28.821562720281754	5.3685717579521794	0.7670382704372806
445	-0.0011809207198825433	3.9255524437009632	38.77374884279516	6.22685705976901	0.8611300558078194
446	-0.00632100408518551	3.3611648427971064	21.87723187344444	4.677310324689227	0.6528604120406366
447	-0.0013513696138358173	2.6553224609395483	13.386755941274009	3.658791595769566	0.6489227963910184
448	-0.004802924115968531	2.6649392820081568	13.696354140846239	3.7008585680685284	0.6068651936731355
450	-0.01040894547738824	3.1430961234303076	18.801526265018165	4.336072677552599	0.7030283292240725
453	-0.008759153266762645	3.128709886953195	19.002643399184215	4.3592021516768655	0.6893744348511887
458	-0.027512061520518574	3.4028557133254336	24.922145696565817	4.9922084989076545	0.7515230005561051
459	-0.0178485402595617	5.101777709896393	49.68090688808622	7.048468407256021	0.7641649337667126
461	-0.0025528382955487434	3.2918059581109778	22.905803760408112	4.786000810740436	0.7196394707013432
462	-0.011940626082223835	3.644084666577598	24.886528971206456	4.988639992142794	0.6724631906400895
465	-0.002597112110060708	4.498055076801325	42.40712214085924	6.512075102519875	0.893229993767752
466	-0.00056088507082982	3.2955196000710103	21.03365600779368	4.58624639632387	0.7044385402002165
468	0.0006032910180323503	3.5531593261878167	26.474774229776983	5.145364343734755	0.7415200978419503
469	-0.006431304105362544	2.784539143624359	19.998549881416615	4.471973823874265	0.6518297577099936
470	-0.006172499858723058	2.648278356019101	13.621763848781653	3.6907673793916698	0.6355564022192369
472	-0.0026233062020750305	3.7214542724160578	28.124954995475456	5.303296615830144	0.7848058841542775
474	-0.01988754020194239	3.17351667300879	20.552932908240013	4.53353426238735	0.6563924783029743
476	-0.00645454576404887	2.735493502840633	14.779446745590684	3.844404602222649	0.6465896830331842
477	-0.011510715065527167	5.592295808553558	70.2655312388653	8.382453771949196	1.0961231202777917
478	-0.005215629696787815	2.77533821891115	15.097534458104935	3.8855545882286786	0.7285078789275449
479	-0.005732708078375045	3.1516532671916155	18.959668886697568	4.354270189905257	0.7213536982665689
480	-0.00048456689128609026	2.3523042084628885	10.852825014951957	3.2943626113334816	0.5698936362246023
482	-0.013353552705775602	3.1168064147714403	18.4952608270861	4.300611680573602	0.6807919152964258
483	-0.0036590732469390457	3.064540813841508	17.51296511952588	4.184849473938804	0.6150100681397718
484	-0.006683275120529245	3.2590466114360646	26.031933294374504	5.102149869846485	0.6191094501292969
485	-0.005801976501673938	2.413831320713698	11.598636953874987	3.405677165245553	0.5508172257880294
486	-0.010712428125813994	3.4673686403169715	23.13089464849844	4.8094588727317795	0.7497880073546925
487	-0.0036161730987600217	3.0767394806061703	22.066794301790956	4.697530660016064	0.7486048295663995

%sql -- all stations

-- Forecast error metrics across all stations combined, computed over the
-- forecast rows whose ds is on or before the station's end_date in citibike.stations.
SELECT
  e.error_sum/n as ME,                    -- mean signed error (yhat - y); can be negative, so it is a bias measure rather than an absolute error
  e.error_sum_abs/n as MAD,               -- mean of the absolute errors
  e.error_sum_sqr/n as MSE,               -- mean of the squared errors
  POWER(e.error_sum_sqr/n, 0.5) as RMSE,  -- square root of MSE
  e.error_sum_abs_prop_y/n as MAPE        -- mean absolute error as a proportion of y_corrected
FROM (
  SELECT -- error base values 
    COUNT(*) as n,
    SUM(x.yhat-x.y) as error_sum,
    SUM(ABS(x.yhat-x.y)) as error_sum_abs,
    SUM(POWER((x.yhat-x.y),2)) as error_sum_sqr,
    SUM(ABS((x.yhat-x.y)/x.y_corrected)) as error_sum_abs_prop_y,
    SUM(ABS((x.yhat-x.y)/x.yhat)) as error_sum_abs_prop_yhat,
    SUM(x.y) as sum_y,
    SUM(x.yhat) as sum_yhat
  FROM ( -- actuals vs. forecast
    SELECT
      a.ds as ds,
      CAST(COALESCE(a.y,0) as float) as y,            -- missing actuals count as 0 in the error terms
      -- NOTE(review): COALESCE only substitutes 1 when y is NULL; a y of exactly 0
      -- still reaches the MAPE division as a zero denominator - confirm the intended handling
      CAST(COALESCE(a.y,1) as float) as y_corrected,
      a.yhat
    FROM citibike.forecast_timeseries_with_regressors a
    INNER JOIN citibike.stations b
      ON a.station_id = b.station_id AND
         a.ds <= b.end_date
     ) x
  ) e

Forecasting using Time Series Analysis with Weather Regressors