# Importing necessary libraries
import os
import sys
import numpy
# for handling files
import csv
import json
# for handling text
import nltk
from nltk.corpus import stopwords
# for plotting
import matplotlib.pyplot as plt
# for pretty printing
import pprint
import prettytable
%matplotlib inline
# Data paths
dataPath = "./practice_data/"
# Check data path
if not os.path.isdir(dataPath):
print "The given path is wrong or director does not exist"
else:
print "All OK!"
Exercise 1.
If you sample data according to a certain type of users, what type of data would you be missing? Conversely, if you were able to tune your sample towards a certain type of content, what kind of users would you be missing?
Data: Sample datasets corresponding to 6 crisis events in 2013.
# Define labels
# Label assigned to tweets that were not annotated
notLabeled = "Not labeled"
# Categories of sources of information during crises
# To focus the analysis on different categories of information types or sources,
# you can set the entries to the below dictionaries to:
# True (to consider it for the analysis) or False (to ignore it)
sourceTypes = {
    "Media": True,
    "Government": True,
    "NGOs": True,
    "Business": True,
    "Outsiders": True,
    "Eyewitness": True
}
# Categories of types of information during crises
infoTypes = {
    "Caution and advice": True,
    "Affected individuals": True,
    "Infrastructure and utilities": True,
    "Donations and volunteering": True,
    "Other Useful Information": True,
    "Sympathy and support": True
}
# Practice datasets for Exercise 1; each key must match a CSV file named
# "<key>.csv" under dataPath
datasets = {
    "2013_Colorado_floods": True,
    "2013_Boston_bombings": True,
    "2013_NY_train_crash": True,
    "2013_Australia_bushfire": True,
    "2013_LA_airport_shootings": True,
    "2013_Typhoon_Yolanda": True
}
rowLabels = sorted([k for k in infoTypes.keys() if infoTypes[k]])
columnLabels = sorted([k for k in sourceTypes.keys() if sourceTypes[k]])
validTypes = set(rowLabels)
validSources = set(columnLabels)
dataStats = {type:{ source: 0 for source in columnLabels} for type in rowLabels}
totalTweets = 0
for event in datasets:
if datasets[event]:
datasetPath = dataPath + event + ".csv"
# Check file
if not os.path.isfile(datasetPath):
print datasetPath + "is not a file or does not exist"
csvFile = open(datasetPath,"r")
reader = csv.DictReader(csvFile)
for row in reader:
if (row[" Information Type"] in validTypes) and (row[" Information Source"] in validSources):
dataStats[row[" Information Type"]][row[" Information Source"]] += 1
totalTweets += 1
csvFile.close()
print "Tweets used for the analysis: ", totalTweets
print
print "Tweets break down by types and sources of information"
pprint.pprint(dataStats)
# Better visualize these stats
# How do you want to normalize the data? Options: total, per source, per type
normalization = "per type"
if normalization == "total":
m_data=(numpy.array([[float(dataStats[r][c])/totalTweets for c in columnLabels] for r in rowLabels]))
elif normalization == "per source":
m_data=(numpy.array([[float(dataStats[r][c])/sum([dataStats[x][c] for x in rowLabels]) for c in columnLabels] for r in rowLabels]))
elif normalization == "per type":
m_data=(numpy.array([[float(dataStats[r][c])/sum(dataStats[r].values()) for c in columnLabels] for r in rowLabels]))
# Plot the interplay between message types and sources
fig = plt.figure(1,figsize=(8,4))
ax = fig.add_subplot(111)
heatmap = ax.pcolor(m_data, cmap='gist_earth_r')
heatmap.set_clim(vmin=0,vmax=0.3)
# Put the major ticks at the middle of each cell
ax.set_xticks(numpy.arange(m_data.shape[1]) + 0.5, minor=False)
ax.set_yticks(numpy.arange(m_data.shape[0]) + 0.5, minor=False)
ax.set_yticklabels(rowLabels, fontsize=12, minor=False)
ax.set_xticklabels(columnLabels, fontsize=12, minor=False)
cbar = fig.colorbar(heatmap, ticks=[0, 0.05, 0.1, 0.15, 0.2, 0.25])
cbar.ax.set_yticklabels(['0', '5%', '10%', '15%', '20%', "25%"], fontsize=11, minor=False)
plt.xticks(rotation=90)
plt.subplots_adjust(left = 0.23, right = 1, top = 0.99, bottom = 0.23, hspace = 0.2, wspace = 0.1)
if normalization == "total":
print "Note: Cells add up to 100%"
elif normalization == "per source":
print "Note: Each column adds up to 100%"
elif normalization == "per type":
print "Note: Each raw adds up to 100%"
plt.show()
Exercise 2.
How much do data samples change if you use different sets of keywords? Do you also spot mistakes in labels?
Data: Sample dataset with tweets posted during Hurricane Sandy in 2012 (50% collected with 4 keywords: hurricane, hurricane sandy, frankenstorm, #sandy; and 50% geo-located within the areas affected by Hurricane Sandy around New York City)
# Auxiliary functions
## Pre-process tweets
def get_term_list(message):
    """Tokenize a raw UTF-8 tweet into a list of lowercased terms.

    Keeps words and #hashtags of 3-15 characters after stripping the
    surrounding punctuation, dropping English stopwords and Twitter
    noise terms ('rt', 'http', 'amp').
    """
    tweet_terms = []
    # Raw string for the regex; matches optional '#' followed by word chars
    tokenizer = nltk.RegexpTokenizer(r'\#?[\w\d]+')
    # Build the stopword set once per call instead of re-reading the NLTK
    # corpus for every single token (stopwords.words() is expensive)
    stop_set = set(stopwords.words('english')) | set(['rt', 'http', 'amp'])
    doc = tokenizer.tokenize(unicode(message, "utf-8"))
    for w_raw in doc:
        w = (w_raw.strip('\"\'.,;?!:)(@/*&')).lower()
        # Direct comparison replaces `len(w) in range(3, 16)`
        if w not in stop_set and 3 <= len(w) <= 15:
            tweet_terms.append(w)
    return tweet_terms
## Checks if message has at least one of the keywords from a list
def match(message, keywords):
    """Return True if the tweet matches at least one keyword.

    A multi-word keyword matches when every one of its words appears in
    the tweet's term list (as a plain term or as a #hashtag); a
    single-word keyword matches when that word appears. Generalizes the
    original handling of 1- and 2-word keywords to phrases of any
    length (3+ word keywords previously could never match).
    """
    terms = get_term_list(message)
    for keyword in keywords:
        words = keyword.split()
        if words and all(w in terms or ('#' + w) in terms for w in words):
            return True
    return False
# Labels and sampling configuration for the keyword-sampling exercise.
labels = ["on-topic", "off-topic"]
# Keywords used to sample the dataset.
# Play first with the collection keywords: hurricane, hurricane sandy, frankenstorm, #sandy
keywordsList = ["storm"]
# Per-label tweet counters: every labeled tweet vs. keyword-matched tweets only.
baseDistribution = dict.fromkeys(labels, 0)
sampleDistribution = dict.fromkeys(labels, 0)
# Read & sample the data from:
baseSamplePath = dataPath + "2012_Sandy_Hurricane-labeled.csv"
# Check file
if not os.path.isfile(datasetPath):
print datasetPath + "is not a file or does not exist"
csvFile = open(baseSamplePath,"r")
reader = csv.DictReader(csvFile)
for row in reader:
baseDistribution[row[" label"]]+=1
if match(row[" tweet"], keywordsList):
sampleDistribution[row[" label"]]+=1
# You can uncomment the three lines below to see matched tweets that were labeled as off-topic
#print "Messages matched by the keywords, but labeled as unrelated. Do you spot mistakes?"
#if row[" label"] == "off-topic":
# print row[" tweet"]
csvFile.close()
truePositive = sampleDistribution["on-topic"]
falsePositive = sampleDistribution["off-topic"]
trueNegative = baseDistribution["off-topic"]-sampleDistribution["off-topic"]
falseNegative = baseDistribution["on-topic"]-sampleDistribution["on-topic"]
precision = 100*float(truePositive)/(truePositive + falsePositive)
recall = 100*float(truePositive)/baseDistribution["on-topic"]
print
print "List of keywords used to sample: ", keywordsList
print "Precision: %.2f" % precision
print "Recall: %.2f" % recall
# Put results in a nice table as well
rTable = prettytable.PrettyTable()
rTable.field_names = [" ", "Matched", "Not matched", "Total"]
rTable.add_row(["On-topic", truePositive, falseNegative,
truePositive + falseNegative])
rTable.add_row(["Off-topic", falsePositive, trueNegative,
falsePositive + trueNegative ])
rTable.add_row(["Total", falsePositive + truePositive, falseNegative + trueNegative,
falsePositive + trueNegative + truePositive + falseNegative])
print rTable
Exercise 3.
Given a set of users posting on a given issue (e.g. using a given hashtag), would they talk about the same topics irrespective of their demographics? What other participation patterns might change?
Data: Anonymized sample dataset of users tweeting about the #BlackLivesMatter movement, along with their demographics and other characteristics.
# Define labels
# Sentinel value for users whose demographics could not be annotated
notLabeledUser = None
# To focus the analysis on different categories of each demographic criteria,
# you can set the entries to the below dictionaries to:
# True (to consider it for the analysis) or False (to ignore it)
age = {
    "Unknown": False,
    "Young Adult: Between 18 to 29 years old": True,
    "Adult: Between 30 to 64 years old": True,
    "Elderly: 65 years or older": True,
    "Child: 17 years or younger": False
}
gender = {
    "Unknown": False,
    "Female": True,
    "Male": True,
    "Both": False,
}
race = {
    "Unknown": False,
    "White": True,
    "Black": True,
    "Asian": True,
    "Other racial group": True
}
# NOTE(review): this name shadows the builtin `type`. It is kept because the
# analysis cells below document assigning it to `criteria`, and the builtin is
# not used elsewhere in this notebook — but consider renaming it.
type = {
    "organization": True,
    "individual": True
}
# Read list of users
userSamplePath = dataPath + "blacklivesmatter_anon.json"
userList = {}
with open(userSamplePath) as f:
for line in f:
u = json.loads(line)
userList[u["user"]]=u["details"]
print "The following fields are available for all users:"
print " ".join([k+"\n" for k in u["details"].keys()])
# Let's ask a few simple questions
## 1. What are the top 15 terms related to the #BlackLivesMatter for each race group?
## Are some groups more active than others?
criteria = gender # to use a different criteria, just change the dictionary of labels & the field (e.g. age, gender, type)
criteria_field = "gender"
terms = {c:[] for c in criteria if criteria[c]}
activity = {c:{"users": 0, "tweets":0} for c in criteria if criteria[c]}
for u in userList:
if userList[u][criteria_field]!=notLabeledUser and criteria[userList[u][criteria_field]]:
terms[userList[u][criteria_field]]+=userList[u]["top_terms"]
activity[userList[u][criteria_field]]["users"]+=1
activity[userList[u][criteria_field]]["tweets"]+=userList[u]["topic_tweets"]
for c in terms:
dist = nltk.FreqDist(terms[c])
print "For (%s) criteria we have %s users and %0.2f tweets on average" % (
c,
activity[c]["users"],
float(activity[c]["tweets"])/activity[c]["users"]
)
print "Top 15 terms for (%s) users are:" % c
#print "".join(["%s, %s \n"% (t[0], t[1]) for t in dist.most_common(15)])
dist.plot(15)
## 2. Check also if there are differences regarding general Twitter attributes w.r.t. user demographics:
attrToCheck = "tweets" # you can also change this to: followers, favorites or friends
attrStats = {c:{"users": 0, "attr":0} for c in criteria if criteria[c]}
for u in userList:
if userList[u][criteria_field]!=notLabeledUser and criteria[userList[u][criteria_field]]:
attrStats[userList[u][criteria_field]]["users"]+=1
attrStats[userList[u][criteria_field]]["attr"]+=userList[u][attrToCheck]
for c in attrStats:
print "For (%s) category we have %s users and %0.2f %s on average" % (
c,
attrStats[c]["users"],
float(attrStats[c]["attr"])/attrStats[c]["users"],
attrToCheck
)
Exercise 4. Using the same data sample, explore which demographic criterion seems to be the easiest to identify based on social media profiles.
Note: Annotators were shown automatically generated screenshots of the upper part of users’ public profiles, including the picture banner, the profile picture, the name and profile description, and their last one or two tweets.
annotationCriteria = {"race":{"total":0, "labeled":0, "users":set()},
"age":{"total":0, "labeled":0, "users":set()},
"gender":{"total":0, "labeled":0, "users":set()}}
for u in userList:
#ignore accounts of organizations or those og whose profiles were not correctly displayed
if userList[u]["type"]=="organization" or userList[u]["type"]==notLabeledUser:
continue
for a in annotationCriteria:
annotationCriteria[a]["total"]+=1
if userList[u][a] != "Unknown":
annotationCriteria[a]["labeled"]+=1
annotationCriteria[a]["users"].add(u)
print "From all the examples of individual accounts (%s), we were able to annotate according to:"%annotationCriteria["race"]["total"]
for a in annotationCriteria:
print "%s %0.2f%% accounts"% (a, 100*float(annotationCriteria[a]["labeled"])/annotationCriteria[a]["total"])
all = len(annotationCriteria["race"]["users"] & annotationCriteria["age"]["users"] & annotationCriteria["gender"]["users"])
print "all criteria %0.2f%% accounts"% (100*float(all)/annotationCriteria[a]["total"])
print
print
print "The END!"