Google Reviews analysis¶
Introduction¶
Objective¶
This analysis provides a comprehensive overview of the customer feedback left on Google Reviews. The goals are to:
- Understand User Sentiment: Examine the sentiments of users who have left reviews on Google, differentiating between positive and negative feedback.
- Identify Key Themes: Use topic modeling to identify recurring themes within the positive and negative reviews.
- Prioritize Areas for Improvement: Determine the key areas of concern and satisfaction among users based on the topics extracted.
Data Source¶
The dataset used for this analysis contains reviews from Google. The key fields include:
- Reviewer Name: The name of the reviewer.
- Local Guide: Indicator of whether the reviewer is a local guide.
- Number of Reviews: Total reviews written by the reviewer.
- Number of Photos: Total photos uploaded by the reviewer.
- Weeks Ago: How many weeks ago the review was written.
- Rating: The rating given by the reviewer.
- Review Text: The actual review provided by the reviewer.
- Number of Likes: The number of likes received for the review.
- Reviewer Nationality: Indicator of whether the reviewer is Italian.
- Review Language: The language in which the review was written.
Analysis Overview¶
Preprocessing:
- The data was first inspected for unusual patterns, such as repeated surnames, which can indicate related or duplicate reviewers.
- The dataset was then split into positive and negative reviews using a rating threshold of 2.5 (ratings above 2.5 are treated as positive, 2.5 and below as negative).
Sentiment Analysis:
- Word clouds were generated to visualize the most frequently used words in both positive and negative reviews, highlighting key aspects of user feedback.
Topic Modeling:
- Latent Dirichlet Allocation (LDA) was used to extract common topics from the positive and negative reviews.
- The average rating was calculated for each topic to understand the general sentiment associated with each.
Key Findings¶
Positive Feedback:
- Users who rated the software highly emphasized themes such as customer service, functionality, and overall satisfaction with the product.
Negative Feedback:
- Users who rated the software poorly highlighted issues such as pricing problems, communication difficulties, and service-related complaints.
Recurring Themes:
- The analysis revealed consistent patterns of feedback, suggesting areas of strength and opportunities for improvement.
Next Steps¶
- Customer Support: Focus on improving customer support based on the feedback from negative reviews.
- Pricing Adjustments: Review and adjust pricing strategies to address concerns raised by dissatisfied users.
- Feature Improvements: Explore opportunities to enhance features and functionality based on the positive feedback to reinforce user satisfaction.
In [1]:
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# NLP utilities
import spacy
# Pandas/matplotlib date handling
from pandas.plotting import register_matplotlib_converters
# Text vectorization and topic modelling
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
In [2]:
import pandas as pd
# Load the raw Google Reviews export
df = pd.read_csv('data/raw/reviews.csv')
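As a first sanity check (before looking at surnames), a quick, hedged inspection along these lines can surface unusual patterns; the column names (name, rating, review) are assumed to match the fields listed in the Data Source section.
In [ ]:
# Sketch of the initial inspection step (assumed column names: 'name', 'rating', 'review')
print(df.shape)                                   # number of reviews and fields
print(df.dtypes)                                  # confirm 'rating' is numeric
print(df['rating'].value_counts().sort_index())   # distribution of ratings
df.head()                                         # eyeball a few rows for odd names or empty review text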
In [3]:
# Extract the surname as the last token of the reviewer name (single-character tokens are discarded)
df['surname'] = df['name'].str.split().str[-1].apply(lambda x: x if len(x) > 1 else None)
# Count occurrences of each surname
surname_counts = df['surname'].value_counts()
# Keep surnames that appear more than once
recurring_surnames = surname_counts[surname_counts > 1].index.tolist()
# Filter the dataframe for entries with recurring surnames
recurring_surname_reviews = df[df['surname'].isin(recurring_surnames)]
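To decide which recurring surnames look suspicious enough to exclude, the flagged rows can be listed directly; a minimal sketch, assuming the surname, name and rating columns available at this point:
In [ ]:
# Sketch: list recurring surnames and the reviews attached to them
print(surname_counts[surname_counts > 1])
recurring_surname_reviews[['name', 'surname', 'rating']].sort_values('surname')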
In [4]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Step 1: Remove records involving the surnames "Rodella" and "Tedeschi"
clean_df = df[~df['surname'].isin(['Rodella', 'Tedeschi'])]
# Step 2: Separate the dataset into two groups
positive_reviews = clean_df[
    (clean_df['rating'] > 2.5)
    & clean_df['review'].notna()
    & (clean_df['review'].str.strip() != '')].copy()  # .copy() so later column assignments do not trigger SettingWithCopyWarning
negative_reviews = clean_df[
    (clean_df['rating'] <= 2.5)
    & clean_df['review'].notna()
    & (clean_df['review'].str.strip() != '')].copy()
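Before generating word clouds and topics, it is worth checking how many usable reviews end up in each group, since topic modelling on a very small negative set will be noisy; a one-line check using the DataFrames just created:
In [ ]:
# Sketch: size of each review group after filtering
print('Positive reviews with text:', len(positive_reviews))
print('Negative reviews with text:', len(negative_reviews))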
In [6]:
# Step 3: Generate word clouds for each group
# Positive reviews word cloud
# Exclude the product name so it does not dominate the word clouds
my_additional_stop_words = [
'Smartpricing', 'smartpricing'
]
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)
positive_text = ' '.join(positive_reviews['review'].tolist())
positive_wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=stop_words).generate(positive_text)
# Negative reviews word cloud
negative_text = ' '.join(negative_reviews['review'].tolist())
negative_wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=stop_words).generate(negative_text)
# Plotting both word clouds side by side
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
ax[0].imshow(positive_wordcloud, interpolation='bilinear')
ax[0].axis('off')
ax[0].set_title('Positive Reviews (>2.5)', fontsize=16)
ax[1].imshow(negative_wordcloud, interpolation='bilinear')
ax[1].axis('off')
ax[1].set_title('Negative Reviews (<=2.5)', fontsize=16)
plt.tight_layout()
plt.show()
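Word clouds are easy to over-read, so a plain frequency count is a useful cross-check; the sketch below reuses the stop_words set and the positive_text / negative_text strings built above.
In [ ]:
# Sketch: raw token frequencies as a cross-check for the word clouds
from collections import Counter
import re

def top_tokens(raw_text, n=15):
    tokens = re.findall(r"[a-z']+", raw_text.lower())
    return Counter(t for t in tokens if t not in stop_words and len(t) > 2).most_common(n)

print('Positive:', top_tokens(positive_text))
print('Negative:', top_tokens(negative_text))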
In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
def extract_topics(text_data, num_topics=3, num_words=5):
    # Vectorize the text data
    vectorizer = CountVectorizer(stop_words=list(stop_words))
    text_vectorized = vectorizer.fit_transform(text_data)
    # Fit LDA model
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(text_vectorized)
    # Extract the highest-weighted words for each topic
    words = np.array(vectorizer.get_feature_names_out())
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words_idx = topic.argsort()[-num_words:][::-1]
        topics.append([words[i] for i in top_words_idx])
    return topics
# Extract topics from positive and negative reviews
positive_topics = extract_topics(positive_reviews['review'].tolist(), num_topics=3, num_words=5)
negative_topics = extract_topics(negative_reviews['review'].tolist(), num_topics=3, num_words=5)
positive_topics, negative_topics
Out[7]:
([['optimal', 'excellent', 'service', 'staff', 'helpful'],
  ['years', 'reviews', 'prices', 'benefits', 'time'],
  ['obviously', 'software', 'time', 'excellent', 'reality']],
 [['quick', 'deserved', 'despite', 'heard', 'prices'],
  ['offers', 'weeks', 'sells', 'event', 'adjust'],
  ['receive', 'emails', 'price', 'room', 'sold']])
Positive Reviews (Rating > 2.5)¶
Topic 1:
- Words: 'optimal', 'excellent', 'service', 'staff', 'helpful'
- Interpretation: This topic highlights the high level of satisfaction with customer service. Users praise the optimal and excellent support provided by the staff, who are consistently described as helpful.
Topic 2:
- Words: 'years', 'reviews', 'prices', 'benefits', 'time'
- Interpretation: Long-term users reflect positively on their extended use of the software, noting the beneficial impact on pricing over time and the positive evolution reflected in their reviews.
Topic 3:
- Words: 'obviously', 'software', 'time', 'excellent', 'reality'
- Interpretation: Users affirm the software’s effectiveness and reliability, pointing out that it excellently meets their needs over time, which they describe as an obvious reality of using the product.
Negative Reviews (Rating ≤ 2.5)¶
Topic 1:
- Words: 'quick', 'deserved', 'despite', 'heard', 'prices'
- Interpretation: This topic reflects users' frustration with how quickly and in what manner pricing issues are handled; reviewers feel they deserve better treatment and are often not heard.
Topic 2:
- Words: 'offers', 'weeks', 'sells', 'event', 'adjust'
- Interpretation: Users are dissatisfied with the timing and adjustment of offers and sales events, indicating potential misalignments in promotional strategies.
Topic 3:
- Words: 'receive', 'emails', 'price', 'room', 'sold'
- Interpretation: Operational inefficiencies are a significant concern, especially related to the reception of emails, room pricing strategies, and booking processes.
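The interpretations above rest on the top five words alone; looking at the LDA word weights gives a better sense of how sharp each topic actually is. A minimal sketch, refitting the same pipeline with the stop_words set and review DataFrames defined earlier:
In [ ]:
# Sketch: print the top words of each topic together with their LDA weights
def topics_with_weights(text_data, num_topics=3, num_words=5):
    vectorizer = CountVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(text_data)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(X)
    words = np.array(vectorizer.get_feature_names_out())
    for i, component in enumerate(lda.components_):
        top = component.argsort()[-num_words:][::-1]
        print(f'Topic {i + 1}:', ', '.join(f'{words[j]} ({component[j]:.1f})' for j in top))

topics_with_weights(negative_reviews['review'].tolist())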
In [8]:
# Step 1: Identify topics for each review
# Function to identify the dominant topic of each review
def assign_topics(text_data, lda_model, vectorizer):
    text_vectorized = vectorizer.transform(text_data)
    topic_distribution = lda_model.transform(text_vectorized)
    assigned_topics = topic_distribution.argmax(axis=1)
    return assigned_topics

# Vectorize and apply LDA to identify topics for each review
def identify_topics_and_ratings(reviews, num_topics=3):
    vectorizer = CountVectorizer(stop_words=list(stop_words))
    text_vectorized = vectorizer.fit_transform(reviews.loc[:, 'review'])
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(text_vectorized)
    # Assign the dominant topic to each review
    reviews.loc[:, 'topic'] = assign_topics(reviews.loc[:, 'review'], lda_model, vectorizer)
    return reviews

# Assign topics to positive and negative reviews
positive_reviews_with_topics = identify_topics_and_ratings(positive_reviews, num_topics=3)
negative_reviews_with_topics = identify_topics_and_ratings(negative_reviews, num_topics=3)

# Step 2: Calculate the mean rating for each topic
positive_mean_ratings = positive_reviews_with_topics.groupby('topic')['rating'].mean().reset_index()
negative_mean_ratings = negative_reviews_with_topics.groupby('topic')['rating'].mean().reset_index()
positive_mean_ratings, negative_mean_ratings
Out[8]:
(   topic  rating
 0      0     5.0
 1      1     5.0
 2      2     5.0,
    topic  rating
 0      0    1.00
 1      1    1.00
 2      2    1.25)
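To read the mean ratings alongside how many reviews each topic actually covers, a small summary can be printed from the DataFrames produced above; a sketch:
In [ ]:
# Sketch: review counts and mean ratings per topic
for label, reviews_df in [('Positive', positive_reviews_with_topics),
                          ('Negative', negative_reviews_with_topics)]:
    summary = (reviews_df.groupby('topic')['rating']
               .agg(['count', 'mean'])
               .rename(columns={'count': 'n_reviews', 'mean': 'mean_rating'}))
    print(f'{label} reviews:')
    print(summary, '\n')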