item_prediction_data.head()


item_prediction_data.describe()


item_prediction_data_with_reviews.head()


item_prediction_data_with_reviews.describe()


# A list of models to try for our the recommender system
algos = [
    SVD,
    SVDpp,
    BaselineOnly,
    SlopeOne,
    NMF,
    NormalPredictor,
    CoClustering,
]
# Algorithms where hyperparameter optimization should be done
algorithms_to_do_grid_search = set(["SVD", "SVDpp"])

# Rating scale
reader = Reader(rating_scale=(0, 9))

# A function that trains and benchmarks an algorithm
def train_model(df, algorithms_to_implement=algos, cv=5):
    # Load data into supreme dataset
    data = Dataset.load_from_df(df, reader)
    # A list to keep the results
    benchmark = []

    # Get results
    for algorithm in algorithms_to_implement:
        # Get the name of the algorithm
        algorithm_name = str(algorithm).split(".")[-1].split("'")[0]
        print(f"Starting {algorithm_name}")
        # Start a timer
        start_time = time.time()
        if algorithm_name not in algorithms_to_do_grid_search:
            # If the algorithm doesn't require hyper param tunning train with cross-validation
            results = cross_validate(
                algorithm(), data, measures=['rmse'], cv=cv, verbose=False
            )
            tmp = DataFrame.from_dict(results).mean(axis=0)
        else:
            # Parameters to try for the hyperparameter tunning
            param_grid = {'n_factors':[50,100,150],'n_epochs':[20,30],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
            
            # Do grid hyperparameter tunning for the model
            gs = GridSearchCV(algorithm, param_grid, measures=['rmse'], cv=3)

            # fit the data
            gs.fit(data)
            params = gs.best_params['rmse']

            # Keep the best model
            results = {
                "test_rmse": [gs.best_score['rmse']],
                **{k: [v] for k, v in gs.best_params['rmse'].items()},
            }

            tmp = DataFrame.from_dict(results).mean(axis=0)
        # End timer
        end_time = time.time()
        # Add the algorithm to the benchmarks list
        tmp = tmp.append(Series([algorithm_name], index=["Algorithm"]))
        benchmark.append(tmp)

        print(
            f"Finished {algorithm_name} total execution time: {end_time - start_time} seconds \n"
        )

    # Return the results, sorted from best to worst algorithm
    return DataFrame(benchmark).set_index("Algorithm").sort_values("test_rmse")


# set data
recommender_data = (
    item_prediction_data_with_reviews.groupby(["customer_unique_id", "product_id"])[
        "review_score"
    ]
    .agg(["mean"])
    .reset_index()
)

# Split train test data (85% train, 15% test)
train, evaluate = sklearn_train_test_split(
    recommender_data, random_state=42, test_size=0.15,
)
# Train all the models
estimations = train_model(train)

Starting SVD
Finished SVD total execution time: 196.94794082641602 seconds 

Starting SVDpp
Finished SVDpp total execution time: 706.4564650058746 seconds 

Starting BaselineOnly
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Finished BaselineOnly total execution time: 3.300546169281006 seconds 

Starting SlopeOne
Finished SlopeOne total execution time: 78.1140968799591 seconds 

Starting NMF
Finished NMF total execution time: 38.525776386260986 seconds 

Starting NormalPredictor
Finished NormalPredictor total execution time: 2.101433753967285 seconds 

Starting CoClustering
Finished CoClustering total execution time: 37.170769453048706 seconds


# Print results
estimations.head(len(algos))

	product_id	product_category_name	en_product_category_name	order_id	order_item_id	seller_id	price	customer_id	customer_unique_id	customer_zip_code_prefix	customer_city	customer_state
0	1e9e8ef04dbcff4541ed26657ea517e5	perfumaria	perfumery	e17e4f88e31525f7deef66779844ddce	1.0	5670f4db5b62c43d542e1b2d56b0cf7c	10.91	f8a3e963a310aa58b60a5b1fed5bceb5	b1a1199364a4a7fe27c4486ab63f550d	13848	mogi-guacu	SP
1	3aa071139cb16b67ca9e5dea641aaa2f	artes	art	5236307716393b7114b53ee991f36956	1.0	b561927807645834b59ef0d16ba55a24	248.00	03fc97548af8f58fefc768d12b546c9c	4b86049cb99e4aa774031daa9cd18f18	20551	rio de janeiro	RJ
2	96bd76ec8810374ed1b65e291975717f	esporte_lazer	sports_leisure	01f66e58769f84129811d43eefd187fb	1.0	7b07b3c7487f0ea825fc6df75abd658b	79.80	e41819d1c95c12c9ce495b630eab8aee	f63805d9c7edb84d92413af34b86a39c	5821	sao paulo	SP
3	cef67bcfe19066a932b7673e239eb23d	bebes	baby	143d00a4f2dde4e0364ee1821577adb3	1.0	c510bc1718f0f2961eaa42a23330681a	112.30	322162b5ca010c2b059cb5224dd818b1	619e926d09b26efbd5180368b1ddc874	2018	sao paulo	SP
4	9dc1a7de274444849c219cff195d0b71	utilidades_domesticas	housewares	86cafb8794cb99a9b1b77fc8e48fbbbb	1.0	0be8ff43f22e456b4e0371b2245e4d01	37.90	c11c31965ff02cc1d7132df8edfcbc22	ad353b4fb0e294adc4eda48af73e68a6	5835	sao paulo	SP

	order_item_id	price	customer_zip_code_prefix
count	112650.000000	112650.000000	113425.000000
mean	1.197834	120.653739	35102.472965
std	0.705124	183.633928	29864.919733
min	1.000000	0.850000	1003.000000
25%	1.000000	39.900000	11250.000000
50%	1.000000	74.990000	24320.000000
75%	1.000000	134.900000	59020.000000
max	21.000000	6735.000000	99990.000000

	review_id	order_id	review_score	review_comment_title	review_comment_message	review_creation_date	review_answer_timestamp	review_comment_message_processed	product_id	product_category_name	en_product_category_name	order_item_id	seller_id	price	customer_id	customer_unique_id	customer_zip_code_prefix	customer_city	customer_state
0	7bc2406110b926393aa56f80a40eba40	73fc7af87114b39712e6da79b0a377eb	4	NaN	nan	2018-01-18 00:00:00	2018-01-18 21:46:59	nan	fd25ab760bfbba13c198fa3b4f1a0cd3	esporte_lazer	sports_leisure	1.0	6d803cb79cc31c41c4c789a75933b3c7	185.00	41dcb106f807e993532d446263290104	68a5590b9926689be4e10f4ae2db21a8	6030	osasco	SP
1	7bc2406110b926393aa56f80a40eba40	73fc7af87114b39712e6da79b0a377eb	4	NaN	nan	2018-01-18 00:00:00	2018-01-18 21:46:59	nan	fd25ab760bfbba13c198fa3b4f1a0cd3	esporte_lazer	sports_leisure	2.0	6d803cb79cc31c41c4c789a75933b3c7	185.00	41dcb106f807e993532d446263290104	68a5590b9926689be4e10f4ae2db21a8	6030	osasco	SP
2	80e641a11e56f04c1ad469d5645fdfde	a548910a1c6147796b98fdf73dbeba33	5	NaN	nan	2018-03-10 00:00:00	2018-03-11 03:05:13	nan	be0dbdc3d67d55727a65d4cd696ca73c	informatica_acessorios	computers_accessories	1.0	8e6d7754bc7e0f22c96d255ebda59eba	79.79	8a2e7ef9053dea531e4dc76bd6d853e6	64190b91b656ab8f37eb89b93dc84584	13380	nova odessa	SP
3	228ce5500dc1d8e020d8d1322874b6f0	f9e4b658b201a9f2ecdecbb34bed034b	5	NaN	nan	2018-02-17 00:00:00	2018-02-18 14:36:24	nan	d1c427060a0f73f6b889a5c7c61f2ac4	informatica_acessorios	computers_accessories	1.0	a1043bafd471dff536d0c462352beb48	149.00	e226dfed6544df5b7b87a48208690feb	1d47144362c14e94ccdd213e8ec277d5	44571	santo antonio de jesus	BA
4	e64fb393e7b32834bb789ff8bb30750e	658677c97b385a9be170737859d3511b	5	NaN	Recebi bem antes do prazo estipulado.	2017-04-21 00:00:00	2017-04-21 22:02:06	recebi ante do prazo estipulado	52c80cedd4e90108bf4fa6a206ef6b03	ferramentas_jardim	garden_tools	1.0	a1043bafd471dff536d0c462352beb48	179.99	de6dff97e5f1ba84a3cd9a3bc97df5f6	c8cf6cb6b838dc7a33ed199b825e8616	88735	gravatal	SC

	review_score	order_item_id	price	customer_zip_code_prefix
count	114100.000000	113322.000000	113322.000000	114100.000000
mean	3.999816	1.198514	120.481328	35105.746450
std	1.412251	0.706993	183.277636	29868.322786
min	1.000000	1.000000	0.850000	1003.000000
25%	3.000000	1.000000	39.900000	11250.000000
50%	5.000000	1.000000	74.900000	24320.000000
75%	5.000000	1.000000	134.900000	59022.000000
max	5.000000	21.000000	6735.000000	99990.000000

	test_rmse	n_factors	n_epochs	lr_all	reg_all	fit_time	test_time
Algorithm
SlopeOne	1.322336	NaN	NaN	NaN	NaN	14.499403	0.175537
SVDpp	1.331925	50.0	30.0	0.01	0.1	NaN	NaN
SVD	1.333714	50.0	30.0	0.01	0.1	NaN	NaN
BaselineOnly	1.340839	NaN	NaN	NaN	NaN	0.319130	0.101361
NMF	1.340940	NaN	NaN	NaN	NaN	7.372755	0.081810
CoClustering	1.341078	NaN	NaN	NaN	NaN	7.132974	0.092079
NormalPredictor	1.927209	NaN	NaN	NaN	NaN	0.084245	0.135720

4 - Recommendation System | Selection Of The Proper Algorithm¶

In order for Olist to achieve better sales the usage of a recommendation algorithm is necessary.¶

SVDpp or SVD++ refers to a matrix factorization model which makes use of implicit feedback information. In general, implicit feedback can refer to any kinds of users' history information that can help indicate users' preference.¶

4.1 - Merge datasets and drop unneeded columns¶

4.2 - Evaluate all the available models¶

We are now going to evaluate the 5 different recommender algorithms in order to find the best one for our purposes.¶

We will use the review score as the score for the recommendation system and then we proceed to the training¶