3.1 - Import a new dataset for Sentiment Analysis

Portuguese Tweets for Sentiment Analysis

We are going to apply the Sentiment Analysis technique to the review texts of our dataset in an attempt to classify each review as positive, negative, or neutral.

After that we will train a model that can predict this, and finally, because we have the actual score for each review, we can compare our predictions with the actual scores and get a good idea of our algorithm's performance.

In order to achieve that, we are going to use this dataset as a helper.

The dataset includes 785,814 tweets in Portuguese, each labeled as either negative or positive.

The tweets in the dataset were collected from Twitter, mainly between 01/08/2018 and 20/10/2018.

There are 4 main datasets:

Tweets with Theme: collected using around 100 political terms together with positive and negative emoticons. Contains around 60k tweets.

No Theme Tweets: collected using only positive and negative emoticons. Contains around 780k tweets.

Neutral Tweets from Hashtags: collected using hashtags. Contains around 15k tweets.

Neutral Tweets from News accounts: collected directly from popular news accounts. Contains around 35k tweets.

From these, the following datasets were created, which can be used to train and validate classification algorithms:

Training datasets:
  • 50k, 100k, 200k, 300k, 400k, 500k positive and negative tweets without any theme
  • 50k positive and negative tweets with a political theme
  • 100k positive, negative and neutral tweets without any theme
Test datasets:
  • 5k positive and negative tweets without any theme
  • 5k positive and negative tweets with a political theme
  • 5k positive, negative and neutral tweets without theme
All of them have an equal number of instances per class. Their sentiment labels were encoded as follows:
  • Negative label: 0
  • Positive label: 1
  • Neutral label: 2

Tweets Cleanup

We will reuse the cleanup_text function defined earlier, this time on the tweets.

In [34]:
# Read tweets data
tweets = pd.read_csv("data/tweets/NoThemeTweets.csv", sep=",")

Let's take a first look at our new Twitter data

In [35]:
tweets.head()
Out[35]:
id tweet_text tweet_date sentiment query_used
0 1031761728445530112 @Tixaa23 14 para eu ir :) Tue Aug 21 04:35:39 +0000 2018 Positivo :)
1 1031761040462278656 @drexalvarez O meu like eu já dei na época :) Tue Aug 21 04:32:55 +0000 2018 Positivo :)
2 1031760962372689920 Eu só queria conseguir comer alguma coisa pra ... Tue Aug 21 04:32:37 +0000 2018 Positivo :)
3 1031760948250456066 :D que lindo dia ! Tue Aug 21 04:32:33 +0000 2018 Positivo :)
4 1031760895985246208 @Primo_Resmungao Pq da pr jeito!!é uma "oferta... Tue Aug 21 04:32:21 +0000 2018 Positivo :)

We are now going to map the sentiment labels to 0 for negative and 1 for positive, and apply our cleanup_text function to clean the texts

The cleanup_text function does the following: removes punctuation, lowercases the text, removes stopwords, and applies lemmatization
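For reference, here is a minimal sketch of what such a function could look like, assuming spaCy with the pt_core_news_sm Portuguese pipeline (the actual cleanup_text implementation is defined in an earlier section and may differ):

import string
import spacy

# Assumption: the small Portuguese spaCy pipeline; the original notebook may
# rely on a different model or on NLTK-based preprocessing instead
nlp = spacy.load("pt_core_news_sm", disable=["parser", "ner"])

def cleanup_text_sketch(text):
    # Lowercase the text and strip punctuation
    text = str(text).lower().translate(str.maketrans("", "", string.punctuation))
    # Lemmatize and drop stopwords and whitespace tokens
    return " ".join(
        tok.lemma_ for tok in nlp(text) if not tok.is_stop and not tok.is_space
    )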
In [37]:
# Relabel the sentiment values as integers
tweets["sentiment"] = tweets["sentiment"].replace({"Positivo": 1, "Negativo": 0})

# NLP
tweets['tweet_text'] = tweets['tweet_text'].parallel_apply(cleanup_text)
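
A note on parallel_apply: it is provided by the pandarallel library, not by pandas itself, and must be initialized once before use. This presumably happened in an earlier section; a minimal sketch:

# Assumed one-time setup from an earlier section
from pandarallel import pandarallel

pandarallel.initialize()  # registers .parallel_apply on pandas objects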

Let's display the proportion of positive and negative tweets

In [38]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = ['Positive', 'Negative']
sizes = [len(tweets[tweets['sentiment']==1]), len(tweets[tweets['sentiment']==0])]


fig, ax = plt.subplots()
fig.set_size_inches(10, 7)
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
ax.set_title('Sentiment proportions')

plt.show()

print(f"Total positive tweets in the dataset: {len(tweets[tweets['sentiment']==1])}")
print(f"Total negative tweets in the dataset: {len(tweets[tweets['sentiment']==0])}")
Total positive tweets in the dataset: 263107
Total negative tweets in the dataset: 522707

Because we don't want our predictions to be biased in favor of negative sentiment, we will delete 259,600 negative tweets so that the dataset contains an equal number of positive and negative tweets.

In [39]:
tweets.drop( tweets[tweets["sentiment"] == 0].head(259600).index, inplace=True)
print(f"Total positive tweets in the dataset: {len(tweets[tweets['sentiment']==1])}")
print(f"Total negative tweets in the dataset: {len(tweets[tweets['sentiment']==0])}")
Total positive tweets in the dataset: 263107
Total negative tweets in the dataset: 263107
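
The drop above removes the first 259,600 negative rows in file order. A random sample would avoid any ordering bias; a sketch of that alternative (not what was run above, and random_state is an arbitrary choice):

# Alternative: drop a random subset of negative tweets instead of the first rows
negatives = tweets[tweets["sentiment"] == 0]
surplus = len(negatives) - (tweets["sentiment"] == 1).sum()
tweets = tweets.drop(negatives.sample(n=surplus, random_state=0).index)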

3.2 - Build a Sentiment Model

Before we build the model, the words have to be converted to numbers. For this task we will use the TF-IDF (term frequency-inverse document frequency) statistic.

The TfidfVectorizer will tokenize the reviews, learn the vocabulary and the inverse document frequency weightings, and allow us to encode new reviews.

After the TF-IDF transformation, the results are fed into a random forest classifier to train the final model.
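
To make the TF-IDF transformation concrete, here is a minimal toy illustration (the three sentences are made up):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["produto excelente", "produto ruim", "entrega rápida excelente"]
vec = TfidfVectorizer()
X = vec.fit_transform(docs)
print(sorted(vec.vocabulary_))  # the learned vocabulary, in feature order
print(X.toarray().round(2))     # one TF-IDF row per document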

In [40]:
# Initialize the tf-idf vectorizer
tfidfconverter = TfidfVectorizer(stop_words=stopwords.words("portuguese"))
# Transform each word in the tweets into their tf-idf format
X_sentiment = tfidfconverter.fit_transform(tweets["tweet_text"])
# Use the sentiment column as our target
y_sentiment = tweets["sentiment"]
# Keep 80% of the data for train and 20% of the data for test
X_train_sent, X_test_sent, y_train_sent, y_test_sent = sklearn_train_test_split(
    X_sentiment, y_sentiment, test_size=0.2, random_state=0
)

# The path that the final model is stored
sent_model_path = "data/sentiment_analysis.model"
# Check if the model exists on disk; if it does, load it, otherwise train it
if exists(sent_model_path):
    # Load model from disk
    sentiment_model = load(sent_model_path)
else:
    # Use a random forest classifier to predict the sentiment
    sentiment_model = RandomForestClassifier(
        n_estimators=100, random_state=0, n_jobs=-1
    )
    # Train the model
    sentiment_model.fit(X_train_sent, y_train_sent)
    # Save model to root directory of the notebook
    dump(sentiment_model, sent_model_path)
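
The helpers used above (exists, load, dump, and sklearn_train_test_split) are assumed to have been imported earlier in the notebook, presumably along these lines:

# Assumed earlier imports (for reference)
from os.path import exists
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as sklearn_train_test_split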
Let's try to predict
In [41]:
predictions = sentiment_model.predict(X_test_sent)

# Print scores for the model
for method in [classification_report, accuracy_score, confusion_matrix]:
    print(f"{method.__name__}:\n", method(y_test_sent, predictions))
print("Confusion matrix normalized over the true labels")
display(
    plot_confusion_matrix(sentiment_model, X_test_sent, y_test_sent, normalize="true")
)
classification_report:
               precision    recall  f1-score   support

           0       0.75      0.77      0.76     52650
           1       0.77      0.74      0.75     52593

    accuracy                           0.76    105243
   macro avg       0.76      0.76      0.76    105243
weighted avg       0.76      0.76      0.76    105243

accuracy_score:
 0.7581311821213762
confusion_matrix:
 [[40755 11895]
 [13560 39033]]
Confusion matrix normalized over the true labels
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f6f5729a2e8>

Our model has an accuracy of 76% on the test data. We consider this sufficient for our purposes.

3.3 - Predict sentiment of Olist reviews

First, we treat reviews with a review_score of 3 as neutral (neither positive nor negative), so they are excluded from the analysis.

Every review with a score smaller than 3 is considered negative, and every review with a score bigger than 3 is considered positive.

Data selection

We create a dataframe that combines the customer, order, order item, and product data with the customer reviews

In [42]:
item_prediction_data = customers_data.merge(orders_data, how="outer", on="customer_id")
item_prediction_data.drop(
    columns=[
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
        "order_status",
    ],
    inplace=True,
)
item_prediction_data = order_items_data.merge(
    item_prediction_data, how="outer", on="order_id"
)
item_prediction_data = product_data.merge(
    item_prediction_data, how="outer", on="product_id"
)
item_prediction_data.drop(
    columns=[
        "shipping_limit_date",
        "freight_value",
        "product_name_lenght",
        "product_description_lenght",
        "product_photos_qty",
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
    ],
    inplace=True,
)
item_prediction_data_with_reviews = order_reviews_data.merge(
    item_prediction_data, how="outer", on="order_id"
)
item_prediction_data_with_reviews.head()
Out[42]:
review_id order_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp review_comment_message_processed product_id product_category_name en_product_category_name order_item_id seller_id price customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 NaN nan 2018-01-18 00:00:00 2018-01-18 21:46:59 nan fd25ab760bfbba13c198fa3b4f1a0cd3 esporte_lazer sports_leisure 1.0 6d803cb79cc31c41c4c789a75933b3c7 185.00 41dcb106f807e993532d446263290104 68a5590b9926689be4e10f4ae2db21a8 6030 osasco SP
1 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 NaN nan 2018-01-18 00:00:00 2018-01-18 21:46:59 nan fd25ab760bfbba13c198fa3b4f1a0cd3 esporte_lazer sports_leisure 2.0 6d803cb79cc31c41c4c789a75933b3c7 185.00 41dcb106f807e993532d446263290104 68a5590b9926689be4e10f4ae2db21a8 6030 osasco SP
2 80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 NaN nan 2018-03-10 00:00:00 2018-03-11 03:05:13 nan be0dbdc3d67d55727a65d4cd696ca73c informatica_acessorios computers_accessories 1.0 8e6d7754bc7e0f22c96d255ebda59eba 79.79 8a2e7ef9053dea531e4dc76bd6d853e6 64190b91b656ab8f37eb89b93dc84584 13380 nova odessa SP
3 228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 NaN nan 2018-02-17 00:00:00 2018-02-18 14:36:24 nan d1c427060a0f73f6b889a5c7c61f2ac4 informatica_acessorios computers_accessories 1.0 a1043bafd471dff536d0c462352beb48 149.00 e226dfed6544df5b7b87a48208690feb 1d47144362c14e94ccdd213e8ec277d5 44571 santo antonio de jesus BA
4 e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 NaN Recebi bem antes do prazo estipulado. 2017-04-21 00:00:00 2017-04-21 22:02:06 recebi ante do prazo estipulado 52c80cedd4e90108bf4fa6a206ef6b03 ferramentas_jardim garden_tools 1.0 a1043bafd471dff536d0c462352beb48 179.99 de6dff97e5f1ba84a3cd9a3bc97df5f6 c8cf6cb6b838dc7a33ed199b825e8616 88735 gravatal SC
In [43]:
# Make a copy of the original data
sentiment_analysis_data = item_prediction_data_with_reviews.copy()

# Merge review titles with review messages to maintain all the info
def combine_reviews(row):
    if pd.isna(row["review_comment_title"]):
        return row["review_comment_message"]
    if pd.isna(row["review_comment_message"]):
        return row["review_comment_title"]
    title = row["review_comment_title"]
    # Avoid a double period when the title already ends with one
    separator = " " if title.endswith(".") else ". "
    return f'{title}{separator}{row["review_comment_message"]}'


sentiment_analysis_data[
    "reviews_including_titles"
] = sentiment_analysis_data.parallel_apply(combine_reviews, axis=1)

# Print count for null rows
display(sentiment_analysis_data.isnull().sum())
review_id                                0
order_id                                 0
review_score                             0
review_comment_title                100386
review_comment_message                   0
review_creation_date                     0
review_answer_timestamp                  0
review_comment_message_processed         0
product_id                             778
product_category_name                 2390
en_product_category_name              2390
order_item_id                          778
seller_id                              778
price                                  778
customer_id                              0
customer_unique_id                       0
customer_zip_code_prefix                 0
customer_city                            0
customer_state                           0
reviews_including_titles                 0
dtype: int64
By merging the titles into the review messages, the number of reviews that can be used increases.
In [44]:
# Drop null rows
sentiment_analysis_data.dropna(subset=["reviews_including_titles"], inplace=True)

# Print example text
for i in range(10):
    print(
        f'Review {i+1}: {np.random.choice(sentiment_analysis_data["reviews_including_titles"])}'
    )
Review 1: nan
Review 2: nan
Review 3: nan
Review 4: nan
Review 5: produto entregue no prazo
Review 6: nan
Review 7: nan
Review 8: nan
Review 9: nan
Review 10: nan
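
Most of the sampled rows print the literal string "nan": the message column stores missing reviews as the text "nan" (see the head() output above), so those rows survive the dropna call. If we wanted to exclude them as well, a sketch (not applied in this notebook):

# Optional: drop rows whose combined review is the literal string "nan"
sentiment_analysis_data = sentiment_analysis_data[
    sentiment_analysis_data["reviews_including_titles"] != "nan"
]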

Let's apply our cleanup_text function to the reviews again

In [45]:
sentiment_analysis_data["reviews_including_titles"] = sentiment_analysis_data[
    "reviews_including_titles"
].parallel_apply(cleanup_text)

Here we define a function that converts a review score into a sentiment label

In [46]:
def review_score_to_sentiment(score):
    if score > 3:
        return 1
    if score < 3:
        return 0
    return None
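
A quick sanity check of the mapping:

# Illustrative checks of the score-to-sentiment mapping
assert review_score_to_sentiment(5) == 1     # positive
assert review_score_to_sentiment(1) == 0     # negative
assert review_score_to_sentiment(3) is None  # neutral, dropped below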

Now let's convert them

In [47]:
# Turn review scores into sentiment labels
sentiment_analysis_data["sentiment_from_score"] = sentiment_analysis_data[
    "review_score"
].parallel_apply(review_score_to_sentiment)
# Drop neutral scores
sentiment_analysis_data.dropna(subset=["sentiment_from_score"], inplace=True)
# Make sure that the scores are stored as integers
sentiment_analysis_data["sentiment_from_score"] = sentiment_analysis_data[
    "sentiment_from_score"
].astype(int)
sentiment_analysis_data
Out[47]:
review_id order_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp review_comment_message_processed product_id product_category_name ... order_item_id seller_id price customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state reviews_including_titles sentiment_from_score
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 NaN nan 2018-01-18 00:00:00 2018-01-18 21:46:59 nan fd25ab760bfbba13c198fa3b4f1a0cd3 esporte_lazer ... 1.0 6d803cb79cc31c41c4c789a75933b3c7 185.00 41dcb106f807e993532d446263290104 68a5590b9926689be4e10f4ae2db21a8 6030 osasco SP nan 1
1 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 NaN nan 2018-01-18 00:00:00 2018-01-18 21:46:59 nan fd25ab760bfbba13c198fa3b4f1a0cd3 esporte_lazer ... 2.0 6d803cb79cc31c41c4c789a75933b3c7 185.00 41dcb106f807e993532d446263290104 68a5590b9926689be4e10f4ae2db21a8 6030 osasco SP nan 1
2 80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 NaN nan 2018-03-10 00:00:00 2018-03-11 03:05:13 nan be0dbdc3d67d55727a65d4cd696ca73c informatica_acessorios ... 1.0 8e6d7754bc7e0f22c96d255ebda59eba 79.79 8a2e7ef9053dea531e4dc76bd6d853e6 64190b91b656ab8f37eb89b93dc84584 13380 nova odessa SP nan 1
3 228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 NaN nan 2018-02-17 00:00:00 2018-02-18 14:36:24 nan d1c427060a0f73f6b889a5c7c61f2ac4 informatica_acessorios ... 1.0 a1043bafd471dff536d0c462352beb48 149.00 e226dfed6544df5b7b87a48208690feb 1d47144362c14e94ccdd213e8ec277d5 44571 santo antonio de jesus BA nan 1
4 e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 NaN Recebi bem antes do prazo estipulado. 2017-04-21 00:00:00 2017-04-21 22:02:06 recebi ante do prazo estipulado 52c80cedd4e90108bf4fa6a206ef6b03 ferramentas_jardim ... 1.0 a1043bafd471dff536d0c462352beb48 179.99 de6dff97e5f1ba84a3cd9a3bc97df5f6 c8cf6cb6b838dc7a33ed199b825e8616 88735 gravatal SC recebi ante do prazo estipulado 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
114095 f3897127253a9592a73be9bdfdf4ed7a 22ec9f0669f784db00fa86d035cf8602 5 NaN nan 2017-12-09 00:00:00 2017-12-11 20:06:42 nan 3a33c980b62eb1ef3b8ae61b6fc6fe55 brinquedos ... 1.0 46dc3b2cc0980fb8ec44634e21d2718e 199.99 d0d7086dea6fcf42b9b690b9f3745c58 597cbb334f18a671472f7e16648228b4 22793 rio de janeiro RJ nan 1
114096 b3de70c89b1510c4cd3d0649fd302472 55d4004744368f5571d1f590031933e4 5 NaN Excelente mochila, entrega super rápida. Super... 2018-03-22 00:00:00 2018-03-23 09:10:43 excelente mochila entrega super rápida super r... 8ba0118a487ec8671aed57e5ef846574 papelaria ... 1.0 17ca9b9e9b9ef8fdb529001b49ebb50f 215.97 fcc7b1caafe3b77fd587bab964c4d1fb c860357db400d72a2497064f8376fba9 37200 lavras MG excelente mochila entrega super rápida super r... 1
114097 1adeb9d84d72fe4e337617733eb85149 7725825d039fc1f0ceb7635e3f7d9206 4 NaN nan 2018-07-01 00:00:00 2018-07-02 12:59:13 nan 73a7fbf8c1048131f3b531af31bcdf0e esporte_lazer ... 1.0 8d956fec2e4337affcb520f56fd8cbfd 50.95 3aa00401736823c73e9fe8683328fa6b 96ac4cb74918d3ace141d3d3ddc1ff02 32667 betim MG nan 1
114098 be360f18f5df1e0541061c87021e6d93 f8bd3f2000c28c5342fedeb5e50f2e75 1 NaN Solicitei a compra de uma capa de retrovisor c... 2017-12-15 00:00:00 2017-12-16 01:29:43 solicitei a compra de capa de retrovisor celt... 54caa022e792f1ce31d4a656cecaa802 automotivo ... 1.0 25cf099de44674fde97473224f9d59ab 10.00 8df587ce8a11ee97b3de9ef3405245c2 c3793040f54f0f511e5bcaa8937d0b0a 18071 sorocaba SP solicitei a compra de capa de retrovisor celt... 0
114099 efe49f1d6f951dd88b51e6ccd4cc548f 90531360ecb1eec2a1fbb265a0db0508 1 NaN meu produto chegou e ja tenho que devolver, po... 2017-07-03 00:00:00 2017-07-03 21:01:49 produto chegou ja que devolver poi defeit... 8fbd36d3b045f5f38b252b1513478f38 informatica_acessorios ... 1.0 7e3f87d16fb353f408d467e74fbd8014 32.90 f6fff47846276bed044b63474bd38884 432d4f5ec4f462779a48a51c33950351 7085 guarulhos SP produto chegou ja que devolver poi defeit... 0

104503 rows × 21 columns

In [48]:
# Turn text reviews into their TF-IDF format
reviews_tfidf = tfidfconverter.transform(
    sentiment_analysis_data["reviews_including_titles"]
)
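
Note that we call transform rather than fit_transform here: the vectorizer must reuse the vocabulary and IDF weights learned from the tweets so that the review features line up with what the model was trained on. Words unseen during training are simply ignored, as this small illustration shows (the sentence is made up):

# Unseen words are dropped; known words keep their tweet-fitted IDF weights
example = tfidfconverter.transform(["produto excelente palavrainventada"])
print(example.shape)  # same number of columns as the training matrix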

Finally, let's try to predict sentiment using the model above

In [49]:
sentiment_analysis_data["sentiment_prediction"] = sentiment_model.predict(reviews_tfidf)

3.4 - Evaluation

Now let's find the reviews where the predicted sentiment is positive although the actual review score implies a negative sentiment.

In [50]:
possible_misc_data = sentiment_analysis_data[
    sentiment_analysis_data["sentiment_prediction"]
    > sentiment_analysis_data["sentiment_from_score"]
][
    [
        "review_comment_message",
        "review_score",
        "sentiment_from_score",
        "sentiment_prediction",
    ]
].groupby(
    "review_comment_message"
)

pred_number = len(possible_misc_data.groups)
actual_number = len(order_reviews_data)
p_score = round(100 - ((pred_number / actual_number) * 100), 2)

# Print the number of comments that might be misclassified
print(
    f"Comments that might have been wrongly predicted: {pred_number} of {actual_number} total (agreement score: {p_score}%)",
    "\n",
)
Comments that might have been wrongly predicted: 8785 of 100000 total (agreement score: 91.22%) 

Let's print 2 examples to inspect whether these were indeed predicted incorrectly
In [51]:
# Print last 2 messages
for k, v in list(possible_misc_data)[-2:]:
    display(v.head(1))
    print(f"Full review for id {v.index[0]}:", k, "\n", "\n", "\n")
review_comment_message review_score sentiment_from_score sentiment_prediction
9668 ótimo e recomendo nota oito e meio. 1 0 1
Full review for id 9668: ótimo e recomendo nota oito e meio. 
 
 

review_comment_message review_score sentiment_from_score sentiment_prediction
104247 😡😡😡😡😡👎👎👎👎👎\r\nEmpresa sem compromisso com o cl... 1 0 1
Full review for id 104247: 😡😡😡😡😡👎👎👎👎👎
Empresa sem compromisso com o cliente