ICWSM'16 Tutorial on Social Data Limits


Interactive Session: Data and Methodological Pitfalls

In [9]:
# Importing necessary libraries
import os
import sys
import numpy

# for handling files
import csv
import json

# for handling text
import nltk 
from nltk.corpus import stopwords

# for plotting
import matplotlib.pyplot as plt

# for pretty printing
import pprint
import prettytable

%matplotlib inline

# Data paths
dataPath = "./practice_data/"

# Check data path
if not os.path.isdir(dataPath):
  print "The given path is wrong or director does not exist"
else:
  print "All OK!"
All OK!

Exercise 1.

If you sample data according to a certain type of users, what type of data would you be missing? Conversely, if you were able to tune your sample towards a certain type of content, what kind of users would you be missing?

Data: Sample datasets corresponding to 6 crisis events in 2013.

In [20]:
# Define labels
notLabeled = "Not labeled"  # placeholder value for rows without an annotation

# Categories of sources of information during crises
# To focus the analysis on different categories of information types or sources, 
# you can set the entries to the below dictionaries to:
# True (to consider it for the analysis) or False (to ignore it)
sourceTypes = {
               "Media": True,
               "Government": True,
               "NGOs": True,
               "Business": True,
               "Outsiders": True,
               "Eyewitness": True
               }

# Categories of types of information during crises
infoTypes = {
             "Caution and advice": True,
             "Affected individuals": True,
             "Infrastructure and utilities": True,
             "Donations and volunteering": True,
             "Other Useful Information": True,
             "Sympathy and support": True
             }

# Practice datasets for Exercise 1
datasets = {
            "2013_Colorado_floods": True,
            "2013_Boston_bombings": True,
            "2013_NY_train_crash": True,
            "2013_Australia_bushfire": True, 
            "2013_LA_airport_shootings": True,
            "2013_Typhoon_Yolanda": True
            }

# Keep only the categories enabled above; sort for a stable table/plot order
rowLabels = sorted(k for k, enabled in infoTypes.items() if enabled)
columnLabels = sorted(k for k, enabled in sourceTypes.items() if enabled)
validTypes = set(rowLabels)
validSources = set(columnLabels)
# Counters: dataStats[info_type][source] -> number of tweets.
# (The comprehension variable is named infoType, not `type`, to avoid
# shadowing the builtin.)
dataStats = {infoType: {source: 0 for source in columnLabels} for infoType in rowLabels}

totalTweets = 0
for event in datasets:
    if datasets[event]:
        datasetPath = dataPath + event + ".csv"
        
        # Check file
        if not os.path.isfile(datasetPath):
            print datasetPath + "is not a file or does not exist"
            
        csvFile = open(datasetPath,"r")
        reader = csv.DictReader(csvFile)
        for row in reader:
            if (row[" Information Type"] in validTypes) and (row[" Information Source"] in validSources):
                dataStats[row[" Information Type"]][row[" Information Source"]] += 1
                totalTweets += 1
        csvFile.close()

print "Tweets used for the analysis: ", totalTweets
print
print "Tweets break down by types and sources of information"
pprint.pprint(dataStats)
Tweets used for the analysis:  5535

Tweets break down by types and sources of information
{'Affected individuals': {'Business': 5,
                          'Eyewitness': 68,
                          'Government': 45,
                          'Media': 950,
                          'NGOs': 12,
                          'Outsiders': 222},
 'Caution and advice': {'Business': 6,
                        'Eyewitness': 15,
                        'Government': 91,
                        'Media': 208,
                        'NGOs': 14,
                        'Outsiders': 86},
 'Donations and volunteering': {'Business': 54,
                                'Eyewitness': 5,
                                'Government': 29,
                                'Media': 125,
                                'NGOs': 138,
                                'Outsiders': 279},
 'Infrastructure and utilities': {'Business': 6,
                                  'Eyewitness': 28,
                                  'Government': 35,
                                  'Media': 250,
                                  'NGOs': 5,
                                  'Outsiders': 57},
 'Other Useful Information': {'Business': 10,
                              'Eyewitness': 72,
                              'Government': 104,
                              'Media': 1047,
                              'NGOs': 31,
                              'Outsiders': 545},
 'Sympathy and support': {'Business': 12,
                          'Eyewitness': 29,
                          'Government': 13,
                          'Media': 74,
                          'NGOs': 12,
                          'Outsiders': 853}}
In [21]:
# Better visualize these stats

# How do you want to normalize the data? Options: total, per source, per type
normalization = "per type"

if normalization == "total":
    m_data=(numpy.array([[float(dataStats[r][c])/totalTweets for c in columnLabels] for r in rowLabels]))
elif normalization == "per source":
    m_data=(numpy.array([[float(dataStats[r][c])/sum([dataStats[x][c] for x in rowLabels]) for c in columnLabels] for r in rowLabels]))
elif normalization == "per type":
    m_data=(numpy.array([[float(dataStats[r][c])/sum(dataStats[r].values()) for c in columnLabels] for r in rowLabels]))

# Plot the interplay between message types and sources
fig = plt.figure(1,figsize=(8,4))
ax = fig.add_subplot(111)
heatmap = ax.pcolor(m_data, cmap='gist_earth_r')
heatmap.set_clim(vmin=0,vmax=0.3)

# Put the major ticks at the middle of each cell
ax.set_xticks(numpy.arange(m_data.shape[1]) + 0.5, minor=False)
ax.set_yticks(numpy.arange(m_data.shape[0]) + 0.5, minor=False)
ax.set_yticklabels(rowLabels, fontsize=12, minor=False)
ax.set_xticklabels(columnLabels, fontsize=12, minor=False)
cbar = fig.colorbar(heatmap, ticks=[0, 0.05, 0.1, 0.15, 0.2, 0.25])
cbar.ax.set_yticklabels(['0', '5%', '10%', '15%', '20%', "25%"], fontsize=11, minor=False)

plt.xticks(rotation=90)
plt.subplots_adjust(left = 0.23, right = 1, top = 0.99, bottom = 0.23, hspace = 0.2, wspace = 0.1)

if normalization == "total":
    print "Note: Cells add up to 100%"
elif normalization == "per source":
    print "Note: Each column adds up to 100%"
elif normalization == "per type":
    print "Note: Each raw adds up to 100%"
    
plt.show()
Note: Each raw adds up to 100%

Exercise 2.

How much do data samples change if you use different sets of keywords? Do you also spot mistakes in labels?

Data: Sample dataset with tweets posted during Hurricane Sandy in 2012 (50% collected with 4 keywords: hurricane, hurricane sandy, frankenstorm, #sandy; and 50% geo-located within the areas affected by Hurricane Sandy around New York City)

In [12]:
# Auxiliar functions

## Pre-process tweets
## Pre-process tweets
def get_term_list(message):
    """Tokenize a tweet into lowercase terms.

    Keeps word/hashtag tokens, strips surrounding punctuation, and drops
    English stopwords, Twitter noise ('rt', 'http', 'amp') and terms
    shorter than 3 or longer than 15 characters.
    """
    tweet_terms = []
    tokenizer = nltk.RegexpTokenizer('\#?[\w\d]+')
    # Hoisted out of the loop: stopwords.words() rebuilds a fresh list on
    # every call, so testing membership per token was O(n) each time.
    skip_words = set(stopwords.words('english')) | set(['rt', 'http', 'amp'])
    doc = tokenizer.tokenize(unicode(message, "utf-8"))
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&').lower()
        # `len(w) in range(3, 16)` was an O(n) list-membership test in
        # Python 2; the comparison chain is equivalent and constant-time.
        if w not in skip_words and 3 <= len(w) <= 15:
            tweet_terms.append(w)
    return tweet_terms

## Checks if message has at least one of the keywords from a list
def match(message, keywords):
    terms = get_term_list(message)
    for l in keywords:
        ls = l.split()
        if len(ls) == 2:
            if (ls[0] in terms or '#'+ls[0] in terms) and (ls[1] in terms or '#'+ls[1] in terms):
                return True
        elif l in terms or '#'+l in terms:
            return True
    return False

# Define labels & sample criteria
labels = ["on-topic", "off-topic"]

# What keywords do you want to use for sampling?  
# Play first with the collection keywords: hurricane, hurricane sandy, frankenstorm, #sandy
keywordsList = ["storm",]

baseDistribution = {l:0 for l in labels}
sampleDistribution = {l:0 for l in labels}

# Read & sample the data from:
baseSamplePath = dataPath + "2012_Sandy_Hurricane-labeled.csv"

# Check file
if not os.path.isfile(datasetPath):
    print datasetPath + "is not a file or does not exist"
            
csvFile = open(baseSamplePath,"r")
reader = csv.DictReader(csvFile)
for row in reader:
    baseDistribution[row[" label"]]+=1
    if match(row[" tweet"], keywordsList):
        sampleDistribution[row[" label"]]+=1
        
        # You can uncomment the three lines below to see matched tweets that were labeled as off-topic
        #print "Messages matched by the keywords, but labeled as unrelated. Do you spot mistakes?"
        #if row[" label"] == "off-topic":
        #    print row[" tweet"]
csvFile.close()

truePositive = sampleDistribution["on-topic"]
falsePositive = sampleDistribution["off-topic"] 
trueNegative = baseDistribution["off-topic"]-sampleDistribution["off-topic"]
falseNegative = baseDistribution["on-topic"]-sampleDistribution["on-topic"] 


precision = 100*float(truePositive)/(truePositive + falsePositive)
recall = 100*float(truePositive)/baseDistribution["on-topic"]

print 
print "List of keywords used to sample: ", keywordsList
print "Precision: %.2f" % precision
print "Recall: %.2f" % recall

# Put results in a nice table as well
rTable = prettytable.PrettyTable()

rTable.field_names = [" ", "Matched", "Not matched", "Total"]
rTable.add_row(["On-topic", truePositive, falseNegative, 
              truePositive + falseNegative])
rTable.add_row(["Off-topic", falsePositive, trueNegative, 
              falsePositive + trueNegative ])
rTable.add_row(["Total", falsePositive + truePositive, falseNegative + trueNegative, 
              falsePositive + trueNegative + truePositive + falseNegative])

print rTable
List of keywords used to sample:  ['storm']
Precision: 93.52
Recall: 3.76
+-----------+---------+-------------+-------+
|           | Matched | Not matched | Total |
+-----------+---------+-------------+-------+
|  On-topic |   231   |     5907    |  6138 |
| Off-topic |    16   |     3854    |  3870 |
|   Total   |   247   |     9761    | 10008 |
+-----------+---------+-------------+-------+

Exercise 3.

Given a set of users posting on a given issue (e.g. using a given hashtag), would they talk about the same topics irrespective of their demographics? What other participation patterns might change?

Data: Anonymized sample dataset of users tweeting about the #BlackLivesMatter movement, along with their demographics and other characteristics.

In [13]:
# Define labels
notLabeledUser = None

# To focus the analysis on different categories of each demographic criteria, 
# you can set the entries to the below dictionaries to:
# True (to consider it for the analysis) or False (to ignore it)
age = {
       "Unknown": False,
       "Young Adult: Between 18 to 29 years old": True,
       "Adult: Between 30 to 64 years old": True,
       "Elderly: 65 years or older": True,
       "Child: 17 years or younger": False
       }

gender = {
          "Unknown": False,
          "Female": True,
          "Male": True,
          "Both": False,
          }

race = {
        "Unknown": False,
        "White": True,
        "Black": True,
        "Asian": True,
        "Other racial group": True
        }

type = {
        "organization": True,
        "individual": True
        }

# Read list of users
userSamplePath = dataPath + "blacklivesmatter_anon.json"

userList = {}
with open(userSamplePath) as f:
    for line in f:
        u = json.loads(line)
        userList[u["user"]]=u["details"]
    print "The following fields are available for all users:"
    print " ".join([k+"\n" for k in u["details"].keys()])
The following fields are available for all users:
gender
 age
 top_terms
 topic_tweets
 race
 followers
 favorites
 type
 tweets
 friends

In [14]:
# Let's ask a few simple questions
## 1. What are the top 15 terms related to the #BlackLivesMatter for each race group? 
##    Are some groups more active than others?

criteria = gender # to use a different criteria, just change the dictionary of labels & the field (e.g. age, gender, type)
criteria_field = "gender"

terms = {c:[] for c in criteria if criteria[c]}
activity = {c:{"users": 0, "tweets":0} for c in criteria if criteria[c]}
for u in userList:
    if userList[u][criteria_field]!=notLabeledUser and criteria[userList[u][criteria_field]]:
        terms[userList[u][criteria_field]]+=userList[u]["top_terms"]
        activity[userList[u][criteria_field]]["users"]+=1
        activity[userList[u][criteria_field]]["tweets"]+=userList[u]["topic_tweets"]

for c in terms:
    dist = nltk.FreqDist(terms[c])
    print "For (%s) criteria we have %s users and %0.2f tweets on average" % (
                                                                              c, 
                                                                              activity[c]["users"], 
                                                                              float(activity[c]["tweets"])/activity[c]["users"]
                                                                              )
    print "Top 15 terms for (%s) users are:" % c
    #print "".join(["%s, %s \n"% (t[0], t[1]) for t in dist.most_common(15)])
    dist.plot(15)
For (Male) criteria we have 628 users and 2.65 tweets on average
Top 15 terms for (Male) users are:
For (Female) criteria we have 779 users and 3.90 tweets on average
Top 15 terms for (Female) users are:
In [15]:
## 2. Check also if there are differences regarding general Twitter attributes w.r.t. user demographics:

attrToCheck = "tweets" # you can also change this to: followers, favorites or friends
attrStats = {c:{"users": 0, "attr":0} for c in criteria if criteria[c]}

for u in userList:
    if userList[u][criteria_field]!=notLabeledUser and criteria[userList[u][criteria_field]]:
        attrStats[userList[u][criteria_field]]["users"]+=1
        attrStats[userList[u][criteria_field]]["attr"]+=userList[u][attrToCheck]
        
for c in attrStats:
    print "For (%s) category we have %s users and %0.2f %s on average" % (
                                                                          c, 
                                                                          attrStats[c]["users"], 
                                                                          float(attrStats[c]["attr"])/attrStats[c]["users"],
                                                                          attrToCheck
                                                                          )
For (Male) category we have 628 users and 15699.66 tweets on average
For (Female) category we have 779 users and 16301.34 tweets on average

Exercise 4. Using the same data sample, explore what demographic criteria seems to be the easiest to identify based on social media profiles.

Note: Annotators were shown automatically generated screenshots of the upper part of users’ public profiles, including the picture banner, the profile picture, the name and profile description, and their last one or two tweets.

In [16]:
annotationCriteria = {"race":{"total":0, "labeled":0, "users":set()}, 
                      "age":{"total":0, "labeled":0, "users":set()}, 
                      "gender":{"total":0, "labeled":0, "users":set()}}

for u in userList:
    #ignore accounts of organizations or those og whose profiles were not correctly displayed
    if userList[u]["type"]=="organization" or userList[u]["type"]==notLabeledUser:
        continue
    for a in annotationCriteria:
        annotationCriteria[a]["total"]+=1
        if userList[u][a] != "Unknown":
            annotationCriteria[a]["labeled"]+=1
            annotationCriteria[a]["users"].add(u)

print "From all the examples of individual accounts (%s), we were able to annotate according to:"%annotationCriteria["race"]["total"] 
for a in annotationCriteria:
    print "%s %0.2f%% accounts"% (a, 100*float(annotationCriteria[a]["labeled"])/annotationCriteria[a]["total"])   

all = len(annotationCriteria["race"]["users"] & annotationCriteria["age"]["users"] & annotationCriteria["gender"]["users"])
print "all criteria %0.2f%% accounts"% (100*float(all)/annotationCriteria[a]["total"])

print
print    
print "The END!"
From all the examples of individual accounts (1657), we were able to annotate according to:
gender 86.54% accounts
age 78.52% accounts
race 80.02% accounts
all criteria 77.49% accounts


The END!