I want to figure out where Islamophobic tweets originate (which tweets, users, etc.) as well as what factors contribute to the responses these tweets receive.
The given dataset comes with a lot of excess columns so I will try to extract the relevant data.
import pandas as pd
import statsmodels as sm
from collections import defaultdict, Counter
import json
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import Normalize
import seaborn as sns; sns.set()
import networkx as nx
from pyvis.network import Network
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
nltk.download('punkt'); nltk.download('stopwords')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import datetime
from tqdm import tqdm
import bar_chart_race as bcr
import twint
import folium
from geopy.geocoders import Nominatim
from folium.plugins import HeatMap
df_50k = pd.read_csv('./noislamophobia-dataset-50k.csv')
df_75k = pd.read_csv('./noislamophobia-dataset-75k.csv')
df = pd.concat([df_50k, df_75k], ignore_index=True)
df.head()
df.in_reply_to_screen_name.notna().sum()
# Inspect the raw 'user' JSON for one tweet; username and other fields will be extracted from it below
df.user[0]
origin = df[['user', 'in_reply_to_screen_name', 'retweet_count', 'retweeted', 'created_at', 'text']].copy()
origin.head()
# want to get username, user_id_str, followers_count, verified out of 'user'
user_df = origin.user.apply(json.loads).apply(pd.Series)
user_df.head()
origin['id_str'] = user_df.id_str.copy()
origin['username'] = user_df.screen_name.copy()
origin['followers_count'] = user_df.followers_count.copy()
origin['verified'] = user_df.verified.copy()
origin = origin[['username', 'id_str', 'followers_count', 'verified', 'in_reply_to_screen_name', 'retweet_count', 'retweeted', 'text', 'created_at']]
origin.head()
# The retweeted column is always false, so recalculate it
origin.retweeted = origin.text.str.startswith('RT')
print(f'Number of tweets that are RTs: {origin.retweeted.sum()}')
# created_at column is not of type datetime
origin.created_at = pd.to_datetime(origin.created_at)
origin[origin.retweeted].head()
I now have the dataframe origin, which contains the username along with some potential measures of how popular each user is and how much "penetration potential" their tweets have.
I will now begin to explore the data.
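A quick look at the spread of those popularity measures before plotting (a sketch; it only summarizes columns already in origin):
# Follower and retweet counts are the main "penetration" proxies; both are likely heavily skewed
print(origin[['followers_count', 'retweet_count']].describe())
print(f"Verified accounts: {origin.verified.sum()}; replies: {origin.in_reply_to_screen_name.notna().sum()}")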
sns.histplot(data=origin, x='followers_count', binrange=(0, 10000))
origin[origin.verified].head()
# Exclude tweets with < 5 retweets since they are the clear majority and throw off the scale of the graph
sns.histplot(data=origin, x='retweet_count', binrange=(5, 500))
origin.groupby([origin.created_at.dt.year, origin.created_at.dt.month])['username'].count().plot(kind='bar', figsize=(20, 5))
Show which words are the most popular to get a general idea of what people are tweeting about.
# Filter out words that start with things in filter_list or any stopwords
filter_list = ['@', '#', 'http', 'rt', '&', 'islam', 'muslim', 'religion', 'don', 'need', 'know']
stops = set(stopwords.words('english'))
def good_wd(wd):
    return not any(wd.lower().startswith(x) for x in filter_list) and wd.lower() not in stops
# Clean up tweets
tweets = df.text.dropna().str.split().apply(lambda lst: ' '.join(filter(good_wd, lst)))
# Generate word cloud
wordcloud = WordCloud().generate(' '.join(tweets))
image = wordcloud.to_image()
#image.save('wordcloud.png')
At this point, it seems clear that verified users have much higher Twitter penetration, since they are much more involved in retweeting and being retweeted. There are not many verified users in this dataset, so I will also look at users with a high number of followers. I will try to visualize how tweets from these users spread and what factors influence that spread.
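As a rough check on the claim about verified users (a sketch; it compares simple means and ignores the heavy skew in both columns):
# Average reach of verified vs. unverified accounts
origin.groupby('verified')[['followers_count', 'retweet_count']].mean()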
# Get a column that has all the @'d users for that tweet
origin['mentions'] = origin.text.str.split().apply(lambda lst: [(x[1:-1] if x.endswith(':') else x[1:]) for x in filter(lambda x: x.startswith('@'), lst)])
origin.head()
cols = ['username', 'followers_count', 'retweet_count', 'mentions']
# Aggregate users to count their average num of followers, total retweets, and total mentions
graph_df = origin[cols].groupby('username').agg({'followers_count': 'mean', 'retweet_count': 'sum', 'mentions': 'sum'}).reset_index()
# Will use follower count to represent node size; scale them between sizes [1000, 51000] for the network
graph_df['scaled_count'] = 1000 + (graph_df.followers_count - graph_df.followers_count.min()) * 50000 / (graph_df.followers_count.max() - graph_df.followers_count.min())
# Explode df on mentions to add in edges
graph_df = graph_df.explode('mentions')
# Add underscore to usernames and mentions so they aren't treated as ints by the library
graph_df.username = graph_df.username + '_'
graph_df.mentions = graph_df.mentions + '_'
# Calculate hex color codes so that higher retweet count corresponds to darker red node
norm = Normalize(vmin=0, vmax=5000, clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.Reds)
graph_df['colors'] = graph_df.retweet_count.apply(lambda x: mcolors.to_hex(mapper.to_rgba(x)))
graph_df.head()
# More red = More retweets, Larger = More followers
nodes = graph_df.head(500).drop_duplicates('username')
node_names = set(nodes.username)
mentions = [x for x in graph_df.head(500).mentions.dropna() if x not in node_names]
title_col = 'Followers: ' + nodes.followers_count.astype(int).astype(str) + '\nRetweets: ' + nodes.retweet_count.astype(str)
nt = Network(height=800, width='100%', notebook=True, directed=True)
nt.add_nodes(nodes.username.to_numpy(), title=title_col, value=nodes.scaled_count, color=nodes.colors)
nt.add_nodes(mentions, title=mentions, value=[50] * len(mentions), color=['#FFFFFF'] * len(mentions))
nt.add_edges(graph_df[['username', 'mentions']].head(500).dropna().to_records(index=False))
nt.show('pen_network.html')
I will be using the VADER sentiment analysis library since it works well on emojis and slang. It generates a compound score for each tweet ranging from -1 (most negative) to +1 (most positive).
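For reference, here is the shape of VADER's output on a made-up string (illustrative only; polarity_scores returns neg/neu/pos proportions plus the compound score used below):
# Example VADER output on an invented string, not a tweet from the dataset
SentimentIntensityAnalyzer().polarity_scores("This is absolutely horrible!!!")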
sent = SentimentIntensityAnalyzer()
origin['sentiment'] = origin.text.apply(lambda s: sent.polarity_scores(s)['compound'])
origin.head()
# Five worst sentiment tweets
list(origin.sort_values('sentiment').text[:5])
# Five best sentiment tweets
list(origin.sort_values('sentiment', ascending=False).text[:5])
# Five tweets with most neutral sentiment
list(origin.sort_values('sentiment', key=abs).text[:5])
origin.sentiment.plot.kde()
This sentiment analysis is not perfect. It does a better job of detecting meaner tweets, but it seems to rely heavily on the type of emojis used, since the five worst tweets all contain angry emojis. The highest-sentiment tweets are still quite Islamophobic, but this is to be expected since the dataset consists primarily of Islamophobic tweets. Looking at the KDE plot of the sentiment distribution, most tweets have been tagged with neutral sentiment, and there is a higher peak of negative tweets than positive tweets, which is a good sanity check.
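To put rough numbers on that distribution, here is a sketch that bins the compound scores using VADER's conventional ±0.05 cutoffs for positive/negative/neutral:
# Share of tweets in each sentiment band (cutoffs follow the standard VADER convention)
bands = pd.cut(origin.sentiment, bins=[-1, -0.05, 0.05, 1], labels=['negative', 'neutral', 'positive'], include_lowest=True)
bands.value_counts(normalize=True)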
This type of visualization helps reveal trends over time: which hashtags are most popular, how they are distributed across the period, and how their usage accumulates overall.
df = df.drop_duplicates()
df_na = df.dropna(axis=1, thresh=int(0.75 * len(df)))  # keep columns that are at least 75% non-null
table = defaultdict(int)
for entity in df['entities']:
    test = json.loads(entity)
    for x in test['hashtags']:
        table[x['text'].lower()] += 1
We want to extract a date-only series from created_at; this will also serve as the index the bar chart race library runs against.
df_datetime = pd.to_datetime(df['created_at']).dt.date
df_datetime.head()
Through this distribution, we can see an overall downward trend in Islamophobic tweets, with spikes that correlate with terrorist attacks carried out in the name of Islam (such as by ISIS). For example, in November 2017 the largest influx of Islamophobic tweets occurred, following the vehicle attack in New York City.
df_datetime.hist(figsize=(10,10), bins=100)
plt.show()
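To back up the November 2017 observation, a quick tally of the busiest months (a sketch; exact counts depend on the merged dataset):
# Busiest months by tweet volume; November 2017 should appear near the top if the spike is real
pd.to_datetime(df['created_at']).dt.to_period('M').value_counts().head(5)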
table = dict(sorted(table.items(), key=lambda item: item[1], reverse=True))
Counter(table).most_common(10)
Create a new dataframe for the bar chart race, indexed by date, whose only column is the entities column that holds the hashtags.
new_df = df.copy()
new_df['date'] = pd.to_datetime(new_df['created_at']).dt.date
new_df.index = new_df['date']
new_df = new_df[['entities']]
new_df.head()
datetime_dict = defaultdict(lambda: defaultdict(dict))
def get_counts(entity):
    table = defaultdict(int)
    test = json.loads(entity)
    for x in test['hashtags']:
        table[x['text'].lower()] += 1
    return table

for idx, row in tqdm(new_df.iterrows()):
    datetime_dict[idx] = dict(Counter(datetime_dict[idx]) + Counter(get_counts(row['entities'])))
The bar chart race video is quite revealing. Most of the Islamophobic tweets carry the hashtag #banIslam, while the remaining hashtags rise and fall as time goes on. Another pattern is that the frequency of these tweets slows down over time, suggesting that Twitter is cracking down on racially charged tweets (a rough frequency check follows the rendering code below).
# Renders a 5 second video
'''
df_bar_chart_race = pd.DataFrame.from_dict(datetime_dict,orient='index').fillna(value=0).cumsum()
bcr.bar_chart_race(df = df_bar_chart_race.head(),
n_bars=5,
title = "Popular Hashtags (2017-2021)",
period_length=250,
bar_kwargs={'alpha': .7},
bar_label_size=7)
'''
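As a rough check on the #banIslam dominance noted above (a sketch using the hashtag counts in table; it measures raw frequency only, not spread over time):
# Fraction of all hashtag occurrences accounted for by the single most common tag
tag, count = Counter(table).most_common(1)[0]
print(f"#{tag}: {count} occurrences, {100 * count / sum(table.values()):.1f}% of all hashtag uses")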
This classification model is in its beginning phases, but by simply passing in the most popular hashtags found in the two datasets given to us, I am able to flag tweets in real time (or up to a given date), and it is very evident that racially charged tweets are still very prevalent in our society. With more NLP applied to the classification, I hope to flag tweets at a higher and more efficient rate than the flagging I am doing right now. This live flagging is simply a tool to monitor everything the analysis above helps identify.
# grab the hashtags we want to track
table = defaultdict(int)
for entity in df['entities']:
    test = json.loads(entity)
    for x in test['hashtags']:
        table[x['text'].lower()] += 1
table = Counter(table)
table = dict(table.most_common(100))
# Remove hashtags that won't hold relevance in flagging
remove = ['islam', 'muslim', 'religionofpeace', 'uk', 'maga', 'britain', 'muslims', 'buildthewall', 'jihad', 'americafirst', 'religionbeliefs', 'patriotic', 'america', 'wakeupamerica', 'islamic', 'rt', 'stopcarnage', 'pvv', 'kag', 'breaking', 'makedclisten', 'christian',
'educateyourselfonislam', 'tcot', 'terror','freetommy', 'allah', 'migrant', 'usa', 'trump2020', 'travelban', 'us', 'freetommyrobinson', 'rape', 'immigrationreform', 'bannogozones', 'france', 'lilbulli', 'germany','draintheswamp', 'canada', 'europe','cspi','pakistan','trump','veterans', 'trumptrain', 'iran', 'bancair','ramadan', 'closernation','walkaway', 'tocatchathief', 'minnesota', 'wwg1wga', 'potus', 'hamas', 'quran', 'trudeaumustgo', 'murder','ovc16', 'sweden', 'christians',
'police', 'israel', 'isis']
for key in remove:
    table.pop(key, None)
Counter(table).most_common(15)
string = " OR ".join(list(table.keys())[:10])
print(string)
Notice how recent these tweets are. Through some further digging, I was able to get the location of some of the users, which led to the hypothesis that some of these users may be bots, given the minimal amount of data associated with their usernames.
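One very crude way to probe the bot hypothesis (a sketch only; the threshold is arbitrary and this is not a real bot classifier):
# Unverified accounts with almost no followers look "low-information" by this crude rule
low_info = user_df[(~user_df.verified.fillna(False).astype(bool)) & (user_df.followers_count < 10)]
print(f"{len(low_info)} of {len(user_df)} tweets come from unverified accounts with fewer than 10 followers")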
# Configure
c = twint.Config()
c.Search = string
c.Since = '2021-02-13'
c.Limit = 100
c.Store_csv = True
c.Output = 'twitter2.csv'
# Run
twint.run.Search(c)
df_twitter = pd.read_csv('./twitter2.csv')
df_twitter.head()
counter = 0
for index, row in df_twitter.iterrows():
    if row['date'] == "2021-02-19" and counter < 20:
        print(row['created_at'] + ' ' + row['username'] + ': ' + row['tweet'] + '\n')
        counter += 1
place_df = df[df["place"].notna()].astype('str')
place_df = place_df["place"]
print(place_df)
places = place_df.apply(json.loads).apply(pd.Series)
places['full_name'] = places['full_name'].str.replace(', USA', '')
places['location'] = places['full_name'] + ", " + places['country']
places.head()
locator = Nominatim(user_agent='myGeocoder')
m = folium.Map(location=[20,0], zoom_start=2)
data = []
# Note: Nominatim is rate-limited, so large batches may need geopy's RateLimiter
for index, row in places.iterrows():
    location = locator.geocode(row['location'])
    if location is None:
        location = locator.geocode(row['country'])
    if location is None:
        continue  # skip rows that cannot be geocoded at all
    data.append([location.latitude, location.longitude, 1])
    folium.Circle(
        radius=400,
        location=[location.latitude, location.longitude],
        popup=row["name"],
        color="crimson",
        fill=False,
    ).add_to(m)
places_copy = pd.DataFrame(data, columns=['latitude', 'longitude', 'count'])
m.save('50kdotmap.html')
m
n = folium.Map(location=[20,0], zoom_start=2)
HeatMap(data=places_copy[['latitude', 'longitude', 'count']].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist(), radius=10, max_zoom=10).add_to(n)
n.save('50kheatmap.html')
n
total_entries = len(df.index)
print(len(df.index))
print(len(places.index))
location_entries = len(places.index)
print(f"{100 * location_entries / total_entries:.2f}% of total entries have location information available.")
# Studies suggest that approximately 0.85% of tweets are geotagged: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4636345/
# That study looked at a sample of 113 million tweets. As such, we can be 99% confident that this is a statistically significant difference.
# What is making this percentage so low? Most likely, there is an abundance of bots posting negative content about Islam.
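The 99% confidence claim can be made concrete with a one-proportion z-test against the 0.85% baseline (a sketch; it assumes the baseline applies to this sample and that tweets are independent):
# One-proportion z-test: is our geotagged share significantly different from 0.85%?
from statsmodels.stats.proportion import proportions_ztest
stat, pval = proportions_ztest(count=location_entries, nobs=total_entries, value=0.0085)
print(f"z = {stat:.2f}, p = {pval:.4f}")  # p < 0.01 supports the 99% confidence claim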
We were able to use this data to discover a number of relationships. The tools we presented allow activists to take data-driven action, reaching out to specific users in specific regions so they can efficiently manage their efforts to combat Islamophobia.