import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import json
import re
import requests
import tweepy
from PIL import Image
from io import BytesIO

plt.style.use('fivethirtyeight')
%matplotlib inline


data_dir = "../Data/"  # Data directory folder

twitter_archive = pd.read_csv(f'{data_dir}twitter-archive-enhanced.csv')


# Download the image-prediction table. A timeout keeps the notebook from hanging
# on a stalled connection, and raise_for_status() stops here on an HTTP error so
# an error page is never written to disk as if it were the TSV data.
response = requests.get(
    'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv',
    timeout=30,
)
response.raise_for_status()

# Saving the image prediction into a tsv file (raw bytes, hence binary mode)
with open(f'{data_dir}image_predictions.tsv', 'wb') as file:
    file.write(response.content)


image_pred = pd.read_csv(f'{data_dir}image_predictions.tsv', sep='\t')


from getpass import getpass # Prompts for a value without echoing it, so secrets never appear in the notebook


# The four Twitter credentials are typed in at run time rather than hard-coded.
consumer_key = getpass("Consumer key:") # Application (consumer) key
consumer_secret = getpass("Consumer secret:") # Application (consumer) secret

# OAuth handler built from the application credentials
auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 

# User-level credentials
access_token = getpass("Access Token:")
access_secret = getpass('Access secret:')

auth.set_access_token(access_token, access_secret) # Attaching the access token pair to the handler

# API client used by the download loop; wait_on_rate_limit=True is passed so that
# tweepy waits when the rate limit is reached instead of failing (see tweepy docs)
api = tweepy.API(auth, wait_on_rate_limit=True) # Initializing the API

# Query every tweet id in the archive and store the raw API payload as
# newline-delimited JSON (one tweet per line) for later loading with pandas.
with open(f'{data_dir}tweet_json.txt', 'w') as file:
    for tweetid in twitter_archive['tweet_id'].values:
        # Only the API call is guarded: a failure while writing the file should
        # surface instead of being reported as a missing tweet.
        try:
            tweet = api.get_status(tweetid, tweet_mode='extended')
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
            # stops this long-running loop; tweepy's own error class is named
            # differently across major versions, so it is not referenced here.
            print(f'{tweetid} not found! ({err})')
            continue
        json.dump(tweet._json, file)  # Saving the tweet as one JSON document
        file.write('\n')  # Newline separator between documents


tweets_use = (pd.read_json(f'{data_dir}tweet_json.txt', lines=True)
                .loc[:, ['id', 'retweet_count', 'favorite_count']]) # Loading useful columns for the analysis


twitter_archive.head()


twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 non-null   object 
 14  floofer                     2356 non-null   object 
 15  pupper                      2356 non-null   object 
 16  puppo                       2356 non-null   object 
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB


twitter_archive.isna().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64


twitter_archive.describe()


wrong_denom = (twitter_archive.query('rating_denominator != 10')[
    ['text', 'rating_numerator', 'rating_denominator']]
    .head())
wrong_denom


wrong_denom.loc[313, 'text']

"@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho"


# Checking for wrong ratings representation
wrong_repr = (twitter_archive[twitter_archive['text']
                .str.contains(r'\d+\.\d+/\d+')]
                [['text', 'rating_numerator', 'rating_denominator']])
wrong_repr.head()


wrong_repr.loc[45, 'text']

'This is Bella. She hopes her smile made you smile. If not, she is also offering you her favorite monkey. 13.5/10 https://t.co/qjrljjt948'


wrong_repr.loc[340, 'text']

"RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…"


wrong_repr.loc[695, 'text']

"This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS"


# Inspecting the text of the first 10 tweets whose rating numerator exceeds the usual maximum (14)

twitter_archive.query('rating_numerator > 14')['text'].to_list()[:10]

['@roushfenway These are good dogs but 17/10 is an emotional impulse rating. More like 13/10s',
 '@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research',
 '@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10',
 'RT @KibaDva: I collected all the good dogs!! 15/10 @dog_rates #GoodDogs https://t.co/6UCGFczlOI',
 '@markhoppus 182/10',
 '@bragg6of8 @Andy_Pace_ we are still looking for the first 15/10',
 "@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho",
 "RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…",
 'The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd',
 'Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx']


# Inspecting the text of the first 10 tweets whose rating denominator exceeds the usual value (10)

twitter_archive.query('rating_denominator > 10')['text'].to_list()[:10]

['@docmisterio account started on 11/15/15',
 'The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd',
 'RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/…',
 'Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE',
 'After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ',
 'Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv',
 'Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a',
 'This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq',
 "Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1",
 "Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12"]


twitter_archive.tweet_id.duplicated().any()

False


twitter_archive['name'].value_counts()

None          745
a              55
Charlie        12
Cooper         11
Lucy           11
             ... 
Dex             1
Ace             1
Tayzie          1
Grizzie         1
Christoper      1
Name: name, Length: 957, dtype: int64


twitter_archive['name'].value_counts().sort_index()

Abby            2
Ace             1
Acro            1
Adele           1
Aiden           1
               ..
such            1
the             8
this            1
unacceptable    1
very            5
Name: name, Length: 957, dtype: int64


# Checking for the unique values
for column in twitter_archive.loc[:, "doggo":].columns:
    print(column, "column values:", twitter_archive[column].unique())

doggo column values: ['None' 'doggo']
floofer column values: ['None' 'floofer']
pupper column values: ['None' 'pupper']
puppo column values: ['None' 'puppo']


tweets_use.head()


tweets_use.id.duplicated().any()

False


# brief information of tweets
tweets_use.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   id              2354 non-null   int64
 1   retweet_count   2354 non-null   int64
 2   favorite_count  2354 non-null   int64
dtypes: int64(3)
memory usage: 55.3 KB


tweets_use.describe()


image_pred.head()


# brief info of image prediction dataframe
image_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


image_pred.describe()


twitter_archive_clean = twitter_archive.copy()
tweets_clean = tweets_use.copy()
image_pred_clean = image_pred.copy()


# Parse the timestamp strings, order the archive chronologically, and store the
# tweet id as text (it is an identifier, not a quantity).
twitter_archive_clean = (
    twitter_archive_clean
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values('timestamp')
    .assign(tweet_id=lambda frame: frame['tweet_id'].astype(str))
)


twitter_archive_clean['timestamp'].head()

2355   2015-11-15 22:32:08+00:00
2354   2015-11-15 23:05:30+00:00
2353   2015-11-15 23:21:54+00:00
2352   2015-11-16 00:04:52+00:00
2351   2015-11-16 00:24:50+00:00
Name: timestamp, dtype: datetime64[ns, UTC]


twitter_archive_clean.head()


twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 2355 to 0
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2356 non-null   object             
 1   in_reply_to_status_id       78 non-null     float64            
 2   in_reply_to_user_id         78 non-null     float64            
 3   timestamp                   2356 non-null   datetime64[ns, UTC]
 4   source                      2356 non-null   object             
 5   text                        2356 non-null   object             
 6   retweeted_status_id         181 non-null    float64            
 7   retweeted_status_user_id    181 non-null    float64            
 8   retweeted_status_timestamp  181 non-null    object             
 9   expanded_urls               2297 non-null   object             
 10  rating_numerator            2356 non-null   int64              
 11  rating_denominator          2356 non-null   int64              
 12  name                        2356 non-null   object             
 13  doggo                       2356 non-null   object             
 14  floofer                     2356 non-null   object             
 15  pupper                      2356 non-null   object             
 16  puppo                       2356 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(4), int64(2), object(10)
memory usage: 331.3+ KB


# Split the timestamp into a calendar-date column and a time-of-day column,
# then discard the original timestamp column.
twitter_archive_clean = (
    twitter_archive_clean
    .assign(
        date=lambda frame: pd.to_datetime(frame['timestamp'].dt.date),
        time=lambda frame: frame['timestamp'].dt.time,
    )
    .drop(columns=['timestamp'])
)


twitter_archive_clean.head(2)


twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 2355 to 0
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2356 non-null   object        
 1   in_reply_to_status_id       78 non-null     float64       
 2   in_reply_to_user_id         78 non-null     float64       
 3   source                      2356 non-null   object        
 4   text                        2356 non-null   object        
 5   retweeted_status_id         181 non-null    float64       
 6   retweeted_status_user_id    181 non-null    float64       
 7   retweeted_status_timestamp  181 non-null    object        
 8   expanded_urls               2297 non-null   object        
 9   rating_numerator            2356 non-null   int64         
 10  rating_denominator          2356 non-null   int64         
 11  name                        2356 non-null   object        
 12  doggo                       2356 non-null   object        
 13  floofer                     2356 non-null   object        
 14  pupper                      2356 non-null   object        
 15  puppo                       2356 non-null   object        
 16  date                        2356 non-null   datetime64[ns]
 17  time                        2356 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(11)
memory usage: 349.7+ KB


twitter_archive_clean.loc[:, "doggo":'puppo'] = twitter_archive_clean.loc[:, "doggo":'puppo'].replace({'None': np.nan})


twitter_archive_clean.loc[:5, "doggo":'puppo']


twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 2355 to 0
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2356 non-null   object        
 1   in_reply_to_status_id       78 non-null     float64       
 2   in_reply_to_user_id         78 non-null     float64       
 3   source                      2356 non-null   object        
 4   text                        2356 non-null   object        
 5   retweeted_status_id         181 non-null    float64       
 6   retweeted_status_user_id    181 non-null    float64       
 7   retweeted_status_timestamp  181 non-null    object        
 8   expanded_urls               2297 non-null   object        
 9   rating_numerator            2356 non-null   int64         
 10  rating_denominator          2356 non-null   int64         
 11  name                        2356 non-null   object        
 12  doggo                       97 non-null     object        
 13  floofer                     10 non-null     object        
 14  pupper                      257 non-null    object        
 15  puppo                       30 non-null     object        
 16  date                        2356 non-null   datetime64[ns]
 17  time                        2356 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(11)
memory usage: 414.3+ KB


# Collapse the four stage flag columns into a single `dog_stage` value per tweet.
# Melting yields one row per (tweet, flag); a tweet that carries several flags
# would therefore appear several times and the left merge below would duplicate
# that tweet in the archive. Joining the stages per tweet keeps the result at
# exactly one row per tweet_id (e.g. "doggo, pupper").
dog_stage = (
    twitter_archive_clean
    .melt(id_vars=['tweet_id'],
          value_vars=['doggo', 'floofer', 'pupper', 'puppo'],
          var_name='dogs_col', value_name='dog_stage')
    .dropna(subset=['dog_stage'])          # keep only flags that are actually set
    .groupby('tweet_id', sort=False)['dog_stage']
    .agg(', '.join)
    .reset_index()
)

# Tweets without any stage flag get NaN from the left join
twitter_archive_clean = pd.merge(twitter_archive_clean, dog_stage, on='tweet_id', how='left')

# dog_stage is no longer needed
del dog_stage


# Store the stage as a categorical and remove the four flag columns it replaces.
twitter_archive_clean = (
    twitter_archive_clean
    .astype({'dog_stage': 'category'})
    .drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])
)


# All columns in the dataframe
twitter_archive_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'source',
       'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'date', 'time', 'dog_stage'],
      dtype='object')


twitter_archive_clean.head(2)


twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2370 entries, 0 to 2369
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2370 non-null   object        
 1   in_reply_to_status_id       79 non-null     float64       
 2   in_reply_to_user_id         79 non-null     float64       
 3   source                      2370 non-null   object        
 4   text                        2370 non-null   object        
 5   retweeted_status_id         183 non-null    float64       
 6   retweeted_status_user_id    183 non-null    float64       
 7   retweeted_status_timestamp  183 non-null    object        
 8   expanded_urls               2311 non-null   object        
 9   rating_numerator            2370 non-null   int64         
 10  rating_denominator          2370 non-null   int64         
 11  name                        2370 non-null   object        
 12  date                        2370 non-null   datetime64[ns]
 13  time                        2370 non-null   object        
 14  dog_stage                   394 non-null    category      
dtypes: category(1), datetime64[ns](1), float64(4), int64(2), object(7)
memory usage: 280.2+ KB


twitter_archive_clean['source'] = twitter_archive_clean['source'].str.extract(r'<a.*>(.*)</a>')


twitter_archive_clean['source'].head()

0    Twitter for iPhone
1    Twitter for iPhone
2    Twitter for iPhone
3    Twitter for iPhone
4    Twitter for iPhone
Name: source, dtype: object


twitter_archive_clean['text'] = twitter_archive_clean['text'].str.replace(r'\s*http\S*', "", regex=True)


twitter_archive_clean['text'].str.contains('http').sum()

0


# Discard rows whose `name` is entirely lowercase, then turn every remaining
# literal "None" string in the frame into a real missing value.
# NOTE(review): this removes the whole tweet, not only the name — confirm that
# losing those rows (rather than blanking the name) is intended.
twitter_archive_clean = (
    twitter_archive_clean
    .loc[lambda frame: ~frame['name'].str.islower()]
    .replace({"None": np.nan})
)


twitter_archive_clean.name.value_counts()

Charlie    12
Cooper     11
Oliver     11
Lucy       11
Tucker     10
           ..
Covach      1
Geoff       1
Maxwell     1
Oddie       1
Tilly       1
Name: name, Length: 931, dtype: int64


# Re-extract the rating from the tweet text so that decimal numerators
# (e.g. 13.5/10, 9.75/10) are kept, and store both parts as floats.
#
# The pattern takes the first "<numerator>/<denominator>" in the text where:
#   * the numerator is not glued to a preceding digit or slash (leading zeros
#     such as "007/10" are skipped, giving 7), and
#   * the denominator is not followed by another digit or slash.
# Both guards stop the regex from backtracking into the middle of a number, so
# a date like "11/15/15" yields no rating (NaN) instead of the bogus "11/1".
twitter_archive_clean[['rating_numerator', 'rating_denominator']] = (
    twitter_archive_clean['text']
    .str.extract(r'(?<![\d/])0*([1-9]\d*(?:\.\d+)?)/([1-9]\d*)(?![\d/])', expand=True)
    .astype(float)
)


twitter_archive_clean.head(2)


twitter_archive_clean['rating_numerator'].value_counts()

12.00      547
11.00      449
10.00      438
13.00      346
9.00       144
8.00        95
14.00       50
7.00        49
5.00        35
6.00        30
3.00        18
4.00        14
1.00         8
2.00         7
15.00        2
9.75         2
420.00       2
182.00       1
84.00        1
666.00       1
24.00        1
17.00        1
50.00        1
11.27        1
165.00       1
1776.00      1
121.00       1
99.00        1
80.00        1
45.00        1
44.00        1
143.00       1
20.00        1
9.50         1
11.26        1
144.00       1
88.00        1
13.50        1
Name: rating_numerator, dtype: int64


twitter_archive_clean['rating_denominator'].value_counts()

10.0     2238
11.0        3
80.0        2
20.0        2
50.0        2
120.0       1
16.0        1
110.0       1
130.0       1
40.0        1
90.0        1
150.0       1
7.0         1
70.0        1
1.0         1
Name: rating_denominator, dtype: int64


twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2259 entries, 0 to 2369
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2259 non-null   object        
 1   in_reply_to_status_id       79 non-null     float64       
 2   in_reply_to_user_id         79 non-null     float64       
 3   source                      2259 non-null   object        
 4   text                        2259 non-null   object        
 5   retweeted_status_id         177 non-null    float64       
 6   retweeted_status_user_id    177 non-null    float64       
 7   retweeted_status_timestamp  177 non-null    object        
 8   expanded_urls               2200 non-null   object        
 9   rating_numerator            2257 non-null   float64       
 10  rating_denominator          2257 non-null   float64       
 11  name                        1506 non-null   object        
 12  date                        2259 non-null   datetime64[ns]
 13  time                        2259 non-null   object        
 14  dog_stage                   376 non-null    category      
dtypes: category(1), datetime64[ns](1), float64(6), object(7)
memory usage: 267.1+ KB


# Keep only conventional ratings: denominator exactly 10 and numerator at most 14.
# The numerator then serves as the single rating column (`final_ratings`, on the
# out-of-10 scale), and the now-constant denominator is removed.
twitter_archive_clean = (
    twitter_archive_clean
    .query('rating_denominator == 10 & rating_numerator <= 14')
    .rename(columns={'rating_numerator': 'final_ratings'})
    .drop(columns='rating_denominator')
)


twitter_archive_clean['final_ratings'].head()

0      8.0
4      5.0
11     9.0
12    10.0
13     6.0
Name: final_ratings, dtype: float64


twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2230 entries, 0 to 2369
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2230 non-null   object        
 1   in_reply_to_status_id       69 non-null     float64       
 2   in_reply_to_user_id         69 non-null     float64       
 3   source                      2230 non-null   object        
 4   text                        2230 non-null   object        
 5   retweeted_status_id         175 non-null    float64       
 6   retweeted_status_user_id    175 non-null    float64       
 7   retweeted_status_timestamp  175 non-null    object        
 8   expanded_urls               2179 non-null   object        
 9   final_ratings               2230 non-null   float64       
 10  name                        1502 non-null   object        
 11  date                        2230 non-null   datetime64[ns]
 12  time                        2230 non-null   object        
 13  dog_stage                   376 non-null    category      
dtypes: category(1), datetime64[ns](1), float64(5), object(7)
memory usage: 246.3+ KB


# Keep original tweets only: rows with neither a retweeted-status id nor an
# in-reply-to user id.
twitter_archive_clean = twitter_archive_clean.loc[
    lambda frame: frame['retweeted_status_id'].isna()
    & frame['in_reply_to_user_id'].isna()
]


twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1986 entries, 0 to 2369
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    1986 non-null   object        
 1   in_reply_to_status_id       0 non-null      float64       
 2   in_reply_to_user_id         0 non-null      float64       
 3   source                      1986 non-null   object        
 4   text                        1986 non-null   object        
 5   retweeted_status_id         0 non-null      float64       
 6   retweeted_status_user_id    0 non-null      float64       
 7   retweeted_status_timestamp  0 non-null      object        
 8   expanded_urls               1983 non-null   object        
 9   final_ratings               1986 non-null   float64       
 10  name                        1390 non-null   object        
 11  date                        1986 non-null   datetime64[ns]
 12  time                        1986 non-null   object        
 13  dog_stage                   332 non-null    category      
dtypes: category(1), datetime64[ns](1), float64(5), object(7)
memory usage: 219.4+ KB


# Collect every retweet / reply bookkeeping column by name and remove them;
# they are all empty once retweets and replies have been filtered out.
colto_drop = [
    column
    for column in twitter_archive_clean.columns
    if re.search(r'retweet.*|in_reply.*', column)
]

twitter_archive_clean.drop(columns=colto_drop, inplace=True)


twitter_archive_clean.columns

Index(['tweet_id', 'source', 'text', 'expanded_urls', 'final_ratings', 'name',
       'date', 'time', 'dog_stage'],
      dtype='object')


twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1986 entries, 0 to 2369
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   tweet_id       1986 non-null   object        
 1   source         1986 non-null   object        
 2   text           1986 non-null   object        
 3   expanded_urls  1983 non-null   object        
 4   final_ratings  1986 non-null   float64       
 5   name           1390 non-null   object        
 6   date           1986 non-null   datetime64[ns]
 7   time           1986 non-null   object        
 8   dog_stage      332 non-null    category      
dtypes: category(1), datetime64[ns](1), float64(1), object(6)
memory usage: 141.8+ KB


image_pred_clean['tweet_id'] = image_pred_clean['tweet_id'].astype(str)
tweets_clean['id'] = tweets_clean['id'].astype(str)


image_pred_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   object 
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(1), object(5)
memory usage: 152.1+ KB


tweets_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              2354 non-null   object
 1   retweet_count   2354 non-null   int64 
 2   favorite_count  2354 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 55.3+ KB


def best_prediction(tweet_image):
    """
        Return the label of the most confident of the three predictions.

        `tweet_image` is one row of the image-prediction table, indexable by
        'p1'..'p3' (labels) and 'p1_conf'..'p3_conf' (confidences).

        Ties: 'p1' wins whenever its confidence is not lower than the other
        two; otherwise 'p2' wins only when strictly more confident than 'p3'.

        NOTE: the p*_dog flags are not consulted, so the returned label is
        not guaranteed to be a dog breed.
    """
    conf_1 = tweet_image['p1_conf']
    conf_2 = tweet_image['p2_conf']
    conf_3 = tweet_image['p3_conf']

    if conf_1 >= conf_2 and conf_1 >= conf_3:
        return tweet_image['p1']
    return tweet_image['p2'] if conf_2 > conf_3 else tweet_image['p3']


# Label every image with its most confident prediction (row-wise apply).
image_pred_clean['dog_breed'] = image_pred_clean.apply(best_prediction, axis=1)


# Only the id, the predicted breed and the image link are used from here on.
columns_to_keep = ['tweet_id', 'dog_breed', 'jpg_url']
image_pred_clean = image_pred_clean[columns_to_keep]


image_pred_clean.head(3)


image_pred_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   2075 non-null   object
 1   dog_breed  2075 non-null   object
 2   jpg_url    2075 non-null   object
dtypes: object(3)
memory usage: 48.8+ KB


# Tidy the breed labels: underscores become spaces, then every word is
# capitalised (e.g. "Welsh_springer_spaniel" -> "Welsh Springer Spaniel").
breed_labels = image_pred_clean['dog_breed']
image_pred_clean['dog_breed'] = breed_labels.str.replace("_", " ").str.title()


image_pred_clean.head()


image_pred_clean.tail()


image_pred_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   2075 non-null   object
 1   dog_breed  2075 non-null   object
 2   jpg_url    2075 non-null   object
dtypes: object(3)
memory usage: 48.8+ KB


# Step 1: attach retweet/favorite counts to the archive. The inner join keeps
# only tweets present in both tables; the duplicated key column `id` is dropped.
archive_with_counts = twitter_archive_clean.merge(
    tweets_clean,
    how='inner',
    left_on='tweet_id',
    right_on='id',
).drop(columns='id')

# Step 2: attach the image prediction. The left join keeps every tweet from
# step 1, leaving the prediction columns empty where no image row matches.
df_clean = archive_with_counts.merge(image_pred_clean, how='left', on='tweet_id')


df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1986 entries, 0 to 1985
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   tweet_id        1986 non-null   object        
 1   source          1986 non-null   object        
 2   text            1986 non-null   object        
 3   expanded_urls   1983 non-null   object        
 4   name            1390 non-null   object        
 5   date            1986 non-null   datetime64[ns]
 6   time            1986 non-null   object        
 7   dog_stage       332 non-null    category      
 8   final_ratings   1986 non-null   float64       
 9   retweet_count   1986 non-null   int64         
 10  favorite_count  1986 non-null   int64         
 11  dog_breed       1865 non-null   object        
 12  jpg_url         1865 non-null   object        
dtypes: category(1), datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 203.8+ KB


df_clean.head()


# Keep only the tweets that have an image URL.
has_image = df_clean['jpg_url'].notna()
df_clean = df_clean[has_image]


df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1865 entries, 0 to 1985
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   tweet_id        1865 non-null   object        
 1   source          1865 non-null   object        
 2   text            1865 non-null   object        
 3   expanded_urls   1865 non-null   object        
 4   name            1349 non-null   object        
 5   date            1865 non-null   datetime64[ns]
 6   time            1865 non-null   object        
 7   dog_stage       300 non-null    category      
 8   final_ratings   1865 non-null   float64       
 9   retweet_count   1865 non-null   int64         
 10  favorite_count  1865 non-null   int64         
 11  dog_breed       1865 non-null   object        
 12  jpg_url         1865 non-null   object        
dtypes: category(1), datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 191.4+ KB


# Give two columns more descriptive names.
df_clean = df_clean.rename(columns={'expanded_urls': 'tweet_url', 'name': 'dog_name'})


# Use tweet_id as the index of the dataframe.
df_clean = df_clean.set_index('tweet_id')


df_clean.head(2)


# Final column order of the master table.
column_order = [
    'date', 'time', 'tweet_url', 'text', 'source',
    'retweet_count', 'favorite_count',
    'dog_name', 'dog_stage', 'dog_breed',
    'final_ratings', 'jpg_url',
]
df_clean = df_clean[column_order]


df_clean.head()


# Persist the cleaned master table (tweet_id is written as the index column).
df_clean.to_csv(f'{data_dir}twitter_archive_master.csv')


# Columns that play no part in the charts are left out of the plotting frame.
viz_data = df_clean.drop(['time', 'tweet_url', 'text', 'source'], axis=1)


viz_data.head()


# Likes and retweets against tweet date, drawn on the same axes.
fig, ax = plt.subplots(figsize=(15, 8))
for column, colour, label in (('favorite_count', 'b', 'Likes'),
                              ('retweet_count', 'r', 'Retweets')):
    ax.plot(viz_data['date'], viz_data[column], color=colour, label=label)
ax.set_xlabel('Date')
ax.set_title('Retweets and Likes trend overtime')
ax.legend();


fig, ax = plt.subplots(figsize=(12, 9))
# Pearson correlation between likes and retweets. It is computed from the two
# numeric columns directly: calling DataFrame.corr() on the whole frame fails
# on pandas >= 2.0 because viz_data also holds text, date and category columns.
retw_like_cor = viz_data['favorite_count'].corr(viz_data['retweet_count'])
sns.regplot(x=viz_data['favorite_count'], y=viz_data['retweet_count'], ax=ax)
ax.set_xlabel('Likes', fontsize=12)
ax.set_ylabel('Retweets', fontsize=12)
ax.set_title('Likes & Retweets correlation')
# Surface the coefficient on the chart through the legend text.
ax.legend([(f"Correlation: {retw_like_cor:.3f}")]);


# Histogram of the final ratings.
plt.figure(figsize=(12, 8))
ratings_ax = viz_data['final_ratings'].plot.hist()
ratings_ax.set_xlabel('Ratings')
ratings_ax.set_title('Ratings distribution');


# Ratings of named dogs only, indexed by dog name and ordered best-first.
dog_ratings = (
    viz_data
    .dropna(subset=['dog_name', 'final_ratings'])
    .set_index('dog_name')['final_ratings']
    .sort_values(ascending=False)
)


# Horizontal bars for the twenty highest-rated entries.
fig, ax = plt.subplots(figsize=(8, 12))
dog_ratings.head(20).plot.barh(ax=ax)
ax.set_ylabel("Dogs")
ax.set_xlabel('Ratings')
ax.set_title("Twenty dogs with higher ratings", size=14, color='b');


# Number of tweets per predicted breed, most common first
# (value_counts() leaves missing values out by default).
breed_count = viz_data['dog_breed'].value_counts()


breed_count.head(10).plot.bar(
    figsize=(12, 8),
    title="Top 10 most common dog breeds"
);


# The tail of the descending counts holds the least common breeds.
rare_breeds = breed_count.tail(20)
rare_breeds.plot.pie(
    # One offset per wedge; sized from the data so the call still works when
    # fewer than 20 breeds are present.
    explode=[0.1] * len(rare_breeds),
    wedgeprops={'linewidth': 2},
    figsize=(12, 8),
    title="20 rare dog breeds",
    ylabel="",
);


# The five tweets with the most likes, highest first.
dog_highest_likes = viz_data.nlargest(5, columns='favorite_count')

dog_highest_likes


# viz_data is indexed by tweet_id strings, so `[0]` only worked through the
# positional fallback that pandas has deprecated; select the first row by
# position explicitly.
link = dog_highest_likes['jpg_url'].iloc[0]

# Dog with highest likes
response = requests.get(link, timeout=30)  # never hang forever on a stalled download
response.raise_for_status()  # fail here rather than while decoding an error page


img = Image.open(BytesIO(response.content))
img.resize(size=(600, 400))


# The five tweets with the most retweets, highest first.
dog_highest_ret = viz_data.nlargest(5, columns='retweet_count')

dog_highest_ret


# viz_data is indexed by tweet_id strings, so `[0]` only worked through the
# positional fallback that pandas has deprecated; select the first row by
# position explicitly.
link = dog_highest_ret['jpg_url'].iloc[0]

# Dog with highest retweets
response = requests.get(link, timeout=30)  # never hang forever on a stalled download
response.raise_for_status()  # fail here rather than while decoding an error page


img = Image.open(BytesIO(response.content))
img.resize(size=(600, 400))

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
0	892420643555336193	NaN	NaN	2017-08-01 16:23:56 +0000	<a href="http://twitter.com/download/iphone" r...	This is Phineas. He's a mystical boy. Only eve...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/892420643...	13	10	Phineas	None	None	None	None
1	892177421306343426	NaN	NaN	2017-08-01 00:17:27 +0000	<a href="http://twitter.com/download/iphone" r...	This is Tilly. She's just checking pup on you....	NaN	NaN	NaN	https://twitter.com/dog_rates/status/892177421...	13	10	Tilly	None	None	None	None
2	891815181378084864	NaN	NaN	2017-07-31 00:18:03 +0000	<a href="http://twitter.com/download/iphone" r...	This is Archie. He is a rare Norwegian Pouncin...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891815181...	12	10	Archie	None	None	None	None
3	891689557279858688	NaN	NaN	2017-07-30 15:58:51 +0000	<a href="http://twitter.com/download/iphone" r...	This is Darla. She commenced a snooze mid meal...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891689557...	13	10	Darla	None	None	None	None
4	891327558926688256	NaN	NaN	2017-07-29 16:00:24 +0000	<a href="http://twitter.com/download/iphone" r...	This is Franklin. He would like you to stop ca...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/891327558...	12	10	Franklin	None	None	None	None

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	retweeted_status_id	retweeted_status_user_id	rating_numerator	rating_denominator
count	2.356000e+03	7.800000e+01	7.800000e+01	1.810000e+02	1.810000e+02	2356.000000	2356.000000
mean	7.427716e+17	7.455079e+17	2.014171e+16	7.720400e+17	1.241698e+16	13.126486	10.455433
std	6.856705e+16	7.582492e+16	1.252797e+17	6.236928e+16	9.599254e+16	45.876648	6.745237
min	6.660209e+17	6.658147e+17	1.185634e+07	6.661041e+17	7.832140e+05	0.000000	0.000000
25%	6.783989e+17	6.757419e+17	3.086374e+08	7.186315e+17	4.196984e+09	10.000000	10.000000
50%	7.196279e+17	7.038708e+17	4.196984e+09	7.804657e+17	4.196984e+09	11.000000	10.000000
75%	7.993373e+17	8.257804e+17	4.196984e+09	8.203146e+17	4.196984e+09	12.000000	10.000000
max	8.924206e+17	8.862664e+17	8.405479e+17	8.874740e+17	7.874618e+17	1776.000000	170.000000

	id	retweet_count	favorite_count
0	892420643555336193	8853	39467
1	892177421306343426	6514	33819
2	891815181378084864	4328	25461
3	891689557279858688	8964	42908
4	891327558926688256	9774	41048

	id	retweet_count	favorite_count
count	2.354000e+03	2354.000000	2354.000000
mean	7.426978e+17	3164.797366	8080.968564
std	6.852812e+16	5284.770364	11814.771334
min	6.660209e+17	0.000000	0.000000
25%	6.783975e+17	624.500000	1415.000000
50%	7.194596e+17	1473.500000	3603.500000
75%	7.993058e+17	3652.000000	10122.250000
max	8.924206e+17	79515.000000	132810.000000

	tweet_id	jpg_url	img_num	p1	p1_conf	p1_dog	p2	p2_conf	p2_dog	p3	p3_conf	p3_dog
0	666020888022790149	https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg	1	Welsh_springer_spaniel	0.465074	True	collie	0.156665	True	Shetland_sheepdog	0.061428	True
1	666029285002620928	https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg	1	redbone	0.506826	True	miniature_pinscher	0.074192	True	Rhodesian_ridgeback	0.072010	True
2	666033412701032449	https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg	1	German_shepherd	0.596461	True	malinois	0.138584	True	bloodhound	0.116197	True
3	666044226329800704	https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg	1	Rhodesian_ridgeback	0.408143	True	redbone	0.360687	True	miniature_pinscher	0.222752	True
4	666049248165822465	https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg	1	miniature_pinscher	0.560311	True	Rottweiler	0.243682	True	Doberman	0.154629	True

	text	rating_numerator	rating_denominator
313	@jonnysun @Lin_Manuel ok jomny I know you're e...	960	0
342	@docmisterio account started on 11/15/15	11	15
433	The floofs have been released I repeat the flo...	84	70
516	Meet Sam. She smiles 24/7 & secretly aspir...	24	7
784	RT @dog_rates: After so many requests, this is...	9	11

	text	rating_numerator	rating_denominator
45	This is Bella. She hopes her smile made you sm...	5	10
340	RT @dog_rates: This is Logan, the Chow who liv...	75	10
695	This is Logan, the Chow who lived. He solemnly...	75	10
763	This is Sophie. She's a Jubilant Bush Pupper. ...	27	10
1689	I've been told there's a slight possibility he...	5	10

	tweet_id	img_num	p1_conf	p2_conf	p3_conf
count	2.075000e+03	2075.000000	2075.000000	2.075000e+03	2.075000e+03
mean	7.384514e+17	1.203855	0.594548	1.345886e-01	6.032417e-02
std	6.785203e+16	0.561875	0.271174	1.006657e-01	5.090593e-02
min	6.660209e+17	1.000000	0.044333	1.011300e-08	1.740170e-10
25%	6.764835e+17	1.000000	0.364412	5.388625e-02	1.622240e-02
50%	7.119988e+17	1.000000	0.588230	1.181810e-01	4.944380e-02
75%	7.932034e+17	1.000000	0.843855	1.955655e-01	9.180755e-02
max	8.924206e+17	4.000000	1.000000	4.880140e-01	2.734190e-01

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	timestamp	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	doggo	floofer	pupper	puppo
2355	666020888022790149	NaN	NaN	2015-11-15 22:32:08+00:00	<a href="http://twitter.com/download/iphone" r...	Here we have a Japanese Irish Setter. Lost eye...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666020888...	8	10	None	None	None	None	None
2354	666029285002620928	NaN	NaN	2015-11-15 23:05:30+00:00	<a href="http://twitter.com/download/iphone" r...	This is a western brown Mitsubishi terrier. Up...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666029285...	7	10	a	None	None	None	None
2353	666033412701032449	NaN	NaN	2015-11-15 23:21:54+00:00	<a href="http://twitter.com/download/iphone" r...	Here is a very happy pup. Big fan of well-main...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666033412...	9	10	a	None	None	None	None
2352	666044226329800704	NaN	NaN	2015-11-16 00:04:52+00:00	<a href="http://twitter.com/download/iphone" r...	This is a purebred Piers Morgan. Loves to Netf...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666044226...	6	10	a	None	None	None	None
2351	666049248165822465	NaN	NaN	2015-11-16 00:24:50+00:00	<a href="http://twitter.com/download/iphone" r...	Here we have a 1949 1st generation vulpix. Enj...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666049248...	5	10	None	None	None	None	None

	tweet_id	in_reply_to_status_id	in_reply_to_user_id	source	text	retweeted_status_id	retweeted_status_user_id	retweeted_status_timestamp	expanded_urls	rating_numerator	rating_denominator	name	date	time	dog_stage
0	666020888022790149	NaN	NaN	Twitter for iPhone	Here we have a Japanese Irish Setter. Lost eye...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666020888...	8.0	10.0	NaN	2015-11-15	22:32:08	NaN
4	666049248165822465	NaN	NaN	Twitter for iPhone	Here we have a 1949 1st generation vulpix. Enj...	NaN	NaN	NaN	https://twitter.com/dog_rates/status/666049248...	5.0	10.0	NaN	2015-11-16	00:24:50	NaN

	tweet_id	dog_breed	jpg_url
0	666020888022790149	Welsh Springer Spaniel	https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg
1	666029285002620928	Redbone	https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg
2	666033412701032449	German Shepherd	https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg
3	666044226329800704	Rhodesian Ridgeback	https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg
4	666049248165822465	Miniature Pinscher	https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg

	tweet_id	dog_breed	jpg_url
2070	891327558926688256	Basset	https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg
2071	891689557279858688	Paper Towel	https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg
2072	891815181378084864	Chihuahua	https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg
2073	892177421306343426	Chihuahua	https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg
2074	892420643555336193	Orange	https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg

	date	retweet_count	favorite_count	dog_name	dog_stage	dog_breed	final_ratings	jpg_url
tweet_id
822872901745569793	2017-01-21	48265	132810	NaN	puppo	Lakeland Terrier	13.0	https://pbs.twimg.com/media/C2tugXLXgAArJO4.jpg
744234799360020481	2016-06-18	79515	131075	NaN	doggo	Labrador Retriever	13.0	https://pbs.twimg.com/ext_tw_video_thumb/74423...
879415818425184262	2017-06-26	45849	107956	Duddles	NaN	English Springer	13.0	https://pbs.twimg.com/ext_tw_video_thumb/87941...
807106840509214720	2016-12-09	56625	107015	Stephan	NaN	Chihuahua	13.0	https://pbs.twimg.com/ext_tw_video_thumb/80710...
866450705531457537	2017-05-22	32883	106827	Jamesy	pupper	French Bulldog	13.0	https://pbs.twimg.com/media/DAZAUfBXcAAG_Nn.jpg

Data Wrangling and Visualization on WeRateDogs Tweets Information¶

Introduction¶

Data description¶

Data Gathering Section¶

Assessing Data¶

Twitter archive file¶

Ratings column¶

Tweet id column¶

Name column¶

Dog stages¶

Assessing additional tweet information¶

Assessing image prediction¶

Quality issues¶

Tidiness issues¶

Data Cleaning¶

Data Quality 1¶

Define¶

Code¶

Test¶

Tidiness issue 1¶

Define¶

Code¶

Test¶

Data Quality 2¶

Define¶

Code¶

Test¶

Tidiness issue 2¶

Define¶

Test¶

Data Quality 3¶

Define¶

Code¶

Test¶

Data Quality 4¶

Define¶

Code¶

Test¶

Data Quality 5¶

Define¶

Code¶

Test¶

Data Quality 6¶

Define¶

Code¶

Test¶

Data Quality 7¶

Define¶

Code¶

Test¶

Data Quality 8¶

Define¶

Code¶

Test¶

Data Quality 9¶

Define¶

Code¶

Test¶

Data Quality 10¶

Define¶

Code¶

Test¶

Tidiness issue 3¶

Define¶

Code¶

Test¶

Data Quality 11¶

Define¶

Code¶

Test¶

Tidiness issue 4¶

Define¶

Code¶

Test¶

Data Quality 12¶

Define¶

Code¶

Test¶

Data restructuring¶

Column position restructuring¶

Code¶