from __future__ import division  # must come before any other import
from urllib2 import Request, urlopen
from urlparse import urlparse, urlunparse
import requests, requests_cache
import pandas as pd
import json
from wordcloud import WordCloud
from matplotlib import pyplot as plt
plt.style.use('ggplot')
from bs4 import BeautifulSoup
import re
from os import listdir
from os import chdir
from os.path import isfile, join
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import string
from textblob import TextBlob
import pylab
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import itertools
from collections import Counter
We first want to read all of the text files we have from web scraping.
def read_text(filename):
    """
    This function reads a txt file with a filtered or unfiltered list of titles
    Input: A txt file
    Output: Filtered list of titles
    """
    title_names = []
    with open(filename, "r") as text_file:  # closes the file automatically
        for line in text_file:
            title_names.append(line.strip())
    return title_names
Function that takes a list of titles and returns the polarity and subjectivity of its noun phrases.
def noun_polarity(total_list):
    """
    This function takes in a list of titles
    and separates the titles in noun phrases
    and returns the polarity and subjectivity
    Input: List of Article Titles
    Output: Subjectivity and Polarity of Noun Phrases
    """
    noun_phrases = []
    for title in total_list:
        blob = TextBlob(title)
        noun_phrases.extend(blob.noun_phrases)
    noun_polarity = []
    noun_subjectivity = []
    for noun in noun_phrases:
        blob = TextBlob(noun)
        noun_polarity.append(blob.sentiment.polarity)
        noun_subjectivity.append(blob.sentiment.subjectivity)
    return noun_polarity, noun_subjectivity
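For instance (a made-up headline, purely for illustration), TextBlob first extracts the noun phrases and then scores each phrase separately:
#Hypothetical demo title; the extracted phrases depend on TextBlob's noun phrase chunker
demo_pol, demo_sub = noun_polarity(["Skeptical analysts doubt the great new economy numbers"])
#demo_pol and demo_sub each hold one score per extracted noun phrase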
Function that takes the total polarity list, the outlet name, the type of news, and the number of titles, and returns the negative and positive polarity ratios.
def polarity_levels(total_pol, name, typenews, num):
    """
    This function takes in all the polarity and gets
    the ratio of polarity based on the news outlet
    Input: Polarity, Name, Category, and number of articles
    Output: List consisting of the name, negative and positive
    polarity ratio, category
    """
    pos_pol = [i for i in total_pol if i > 0]
    pos_pol = str(len(pos_pol) / num)
    neg_pol = [i for i in total_pol if i < 0]
    neg_pol = str(len(neg_pol) / num)
    all_pol = [name, neg_pol, pos_pol, typenews]
    return all_pol
def mean(numbers):
    return float(sum(numbers)) / max(len(numbers), 1)
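As a quick sanity check with hypothetical scores (not from the scraped data), polarity_levels turns a list of raw polarity values into negative and positive ratios:
#Hypothetical scores: 1 of 4 is negative, 2 of 4 are positive
polarity_levels([0.5, -0.2, 0.0, 0.8], "Demo Outlet", "Other", 4)
#-> ['Demo Outlet', '0.25', '0.5', 'Other']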
Takes in the article titles and returns the polarity of the titles (not just the nouns)
#see how pos/neg the titles are
def polar_subj(article_titles):
    """
    Takes each article title and determines the polarity & subjectivity
    Input: list of article titles (list)
    Output: title_polarity (list), title_subj (list)
    """
    title_polarity = []
    title_subj = []
    for title in article_titles:
        blob = TextBlob(title)
        title_polarity.append(blob.sentiment.polarity)
        title_subj.append(blob.sentiment.subjectivity)
    return title_polarity, title_subj
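A hypothetical single-title check shows the difference from noun_polarity: here the whole title gets one polarity and one subjectivity score:
#Hypothetical demo: one polarity and one subjectivity value per title
demo_pol, demo_sub = polar_subj(["Markets cheer a surprisingly good jobs report"])
#demo_pol[0] should be positive; demo_sub[0] lies between 0 and 1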
huff = read_text("huffpost.txt")
abc = read_text("abc.txt")
economist = read_text("economist.txt")
fox = read_text("fox.txt")
npr = read_text("npr.txt")
nytimes = read_text("nytimes.txt")
washington = read_text("washington.txt")
wsj = read_text("wsj.txt")
breitbart = read_text("breitbart.txt")
everything = [huff, abc, economist, fox, npr, nytimes, washington, wsj, breitbart]
huff_noun_pol, huff_noun_sub = noun_polarity(huff)
abc_noun_pol, abc_noun_sub = noun_polarity(abc)
economist_noun_pol, economist_noun_sub = noun_polarity(economist)
fox_noun_pol, fox_noun_sub = noun_polarity(fox)
npr_noun_pol, npr_noun_sub = noun_polarity(npr)
nytimes_noun_pol, nytimes_noun_sub = noun_polarity(nytimes)
washington_noun_pol, washington_noun_sub = noun_polarity(washington)
wsj_noun_pol, wsj_noun_sub = noun_polarity(wsj)
breitbart_noun_pol, breitbart_noun_sub = noun_polarity(breitbart)
hp = polarity_levels(huff_noun_pol, "Huffington Post", "Liberal", len(huff))
ab = polarity_levels(abc_noun_pol, "ABC News", "Liberal", len(abc))
ec = polarity_levels(economist_noun_pol, "Economist", "Other", len(economist))
fo = polarity_levels(fox_noun_pol, "Fox", "Conservative", len(fox))
nr = polarity_levels(npr_noun_pol, "NPR", "Other", len(npr))  #renamed from np so it doesn't shadow numpy
ny = polarity_levels(nytimes_noun_pol, "NYTimes", "Liberal", len(nytimes))
wa = polarity_levels(washington_noun_pol, "Washington Post", "Liberal", len(washington))
ws = polarity_levels(wsj_noun_pol, "WSJ", "Other", len(wsj))
br = polarity_levels(breitbart_noun_pol, "Breitbart", "Conservative", len(breitbart))
df = pd.DataFrame([hp, ab, ec, fo, nr, ny, wa, ws, br])
df.columns = ['News Outlet', 'Negative', 'Positive', 'Category']
df
The dataframe above has four columns: the name of the news outlet, its negative polarity ratio, its positive polarity ratio, and its category (Liberal, Conservative, or Other).
df_conservative = df[df['Category'] == "Conservative"]
df_conservative
df_other = df[df['Category'] == "Other"]
df_other
df_liberal = df[df['Category'] == "Liberal"]
df_liberal
lib1 = sum(float(item) for item in df_liberal['Negative'])
con1 = sum(float(item) for item in df_conservative['Negative'])
oth1 = sum(float(item) for item in df_other['Negative'])
lib2 = sum(float(item) for item in df_liberal['Positive'])
con2 = sum(float(item) for item in df_conservative['Positive'])
oth2 = sum(float(item) for item in df_other['Positive'])
total_other = [oth1, oth2]
total_con = [con1, con2]
total_lib = [lib1, lib2]
ind = np.arange(2)
width = 0.15
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_conservative.iloc[0, 1:3].values, width, color='#b3d1ff')
rects2 = ax.bar(ind+width, df_conservative.iloc[1, 1:3].values, width)
ax.set_ylabel('Percent of Polarity Occurrence')
ax.set_xticks(ind+width)
ax.set_title("Polarity Between Conservative News Outlets")
ax.set_xticklabels( ('Negative Polarity', 'Positive Polarity') )
ax.legend( (rects1[0], rects2[0]), ('Fox', 'Breitbart'), bbox_to_anchor=(1, 1), loc=2)
plt.show()
The graph above shows the polarity between the conservative news outlets. Fox and Breitbart have about the same share of negative nouns in their Trump coverage; however, Breitbart is significantly more positive than Fox. This makes sense, since Breitbart is known to be strongly pro-Trump.
width = 0.15
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_liberal.iloc[0, 1:3].values, width, color='#6666ff')
rects2 = ax.bar(ind+width, df_liberal.iloc[1, 1:3].values, width)
rects3 = ax.bar(ind+width+width, df_liberal.iloc[2, 1:3].values, width, color = 'g')
rects4 = ax.bar(ind+width+width+width, df_liberal.iloc[3, 1:3].values, width, color = '#00b3b3')
ax.set_ylabel('Percent of Polarity Occurrence')
ax.set_xticks(ind+width)
ax.set_title("Polarity Between Liberal News Outlets")
ax.set_xticklabels( ('Negative Polarity', 'Positive Polarity') )
ax.legend( (rects1[0], rects2[0], rects3[0], rects4[0]), ('Huffington Post', 'ABC News', 'NYTimes', 'Washington Post'), bbox_to_anchor=(1, 1), loc=2)
plt.show()
This graph (above) shows the polarity percentage difference between the liberal news outlets. The Huffington Post is the highest in both positive and negative polarity, while the NYTimes has the lowest percentage of both negative and positive occurrences. ABC News is similar to the Huffington Post in that its positive and negative percentages are roughly equal. The Washington Post is the only one of the four liberal outlets with more positive than negative polarity overall.
width = 0.15
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_other.iloc[0, 1:3].values, width, color='#80aaff')
rects2 = ax.bar(ind+width, df_other.iloc[1, 1:3].values, width)
rects3 = ax.bar(ind+width+width, df_other.iloc[2, 1:3].values, width, color = '#d279a6')
ax.set_ylabel('Percent of Polarity Occurrence')
ax.set_xticks(ind+width)
ax.set_title("Polarity Between Other News Outlets")
ax.set_xticklabels( ('Negative Polarity', 'Positive Polarity') )
ax.legend( (rects1[0], rects2[0], rects3[0]), ('Economist', 'NPR', 'WSJ'), bbox_to_anchor=(1, 1), loc=2)
plt.show()
Overall, the polarity among the other news outlets was more positive than negative. All three were relatively similar in their positive-to-negative balance, which suggests that these outlets are somewhat neutral in their reporting. There were, however, more positive than negative nouns (by polarity percentage) across all three outlets.
width = 0.15
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, total_other, width, color='#80aaff')
rects2 = ax.bar(ind+width, total_lib, width)
rects3 = ax.bar(ind+width+width, total_con, width, color = '#bf80ff')
ax.set_ylabel('Percent of Polarity Occurrence')
ax.set_xticks(ind+width)
ax.set_title("Polarity Between Other, Liberal, Conservative News Outlets")
ax.set_xticklabels( ('Negative Polarity', 'Positive Polarity') )
ax.legend( (rects1[0], rects2[0], rects3[0]), ('Other', 'Liberal', 'Conservative'), bbox_to_anchor=(1, 1), loc=2)
plt.show()
Overall, we see that all 3 types of news outlets had a higher positive polarity noun count than negative.
#Puts all the titles together into a list according to category
total = huff + abc + economist + fox + npr + nytimes + washington + wsj + breitbart
conserv = fox + breitbart
liber = huff + abc + nytimes + washington
othernews = economist + npr + wsj
def token_word(all_titles):
    """
    This function takes in a list of titles,
    removes stop words and Trump administration names,
    and removes punctuation
    Input: List
    Output: List of single words
    """
    # Strip everything but letters, then tokenize each title into words
    tokentit = [re.sub("[^a-zA-Z]", " ", i) for i in all_titles]
    tokenword = [word_tokenize(w) for w in tokentit]
    merged = list(itertools.chain(*tokenword))
    # Drop English stop words (case-insensitively) and administration names
    stop = set(stopwords.words("english"))
    trumps_admin = ['whitehouse', 'Trump', 'conway', 'sessions', 'pence', 'president', 'tillerson', 'Spicer', 'devos', 'march', 'flynn', 'kushner', 'carson', 'department', 'preibus', 'bannon', 'spicer', 'miller', 'steve', 'Donald', 'jeff', 'sean', 'trump', 'President', 'donald', 'white', 'White', 'house', 'House', 'u']
    filtered_words = [w for w in merged if w not in trumps_admin and w.lower() not in stop]
    return filtered_words
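A quick check on made-up titles shows the cleaning steps: non-letters are stripped, and stop words and administration names are dropped:
#Hypothetical demo: "Trump", "the", and the punctuation should all disappear
print(token_word(["Trump signs the new travel order!", "Critics slam the order"]))
#-> something like ['signs', 'new', 'travel', 'order', 'Critics', 'slam', 'order']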
#Tokenize all news
combined_news = token_word(total)
conserv_news = token_word(conserv)
liber_news = token_word(liber)
other_news = token_word(othernews)
huff_tok = token_word(huff)
abc_tok = token_word(abc)
econ_tok = token_word(economist)
fox_tok = token_word(fox)
npr_tok = token_word(npr)
nyt_tok = token_word(nytimes)
wash_tok = token_word(washington)
bb_tok = token_word(breitbart)
wsj_tok = token_word(wsj)
#Count the top 20 most frequently used words and put in a dataframe
df_combined = pd.DataFrame(Counter(combined_news).most_common()[0:20])
df_c = pd.DataFrame(Counter(conserv_news).most_common()[0:20])
df_o = pd.DataFrame(Counter(other_news).most_common()[0:20])
df_l = pd.DataFrame(Counter(liber_news).most_common()[0:20])
df_huff = pd.DataFrame(Counter(huff_tok).most_common()[0:20])
df_abc = pd.DataFrame(Counter(abc_tok).most_common()[0:20])
df_econ = pd.DataFrame(Counter(econ_tok).most_common()[0:20])
df_fox = pd.DataFrame(Counter(fox_tok).most_common()[0:20])
df_npr = pd.DataFrame(Counter(npr_tok).most_common()[0:20])
df_nyt = pd.DataFrame(Counter(nyt_tok).most_common()[0:20])
df_wash = pd.DataFrame(Counter(wash_tok).most_common()[0:20])
df_bb = pd.DataFrame(Counter(bb_tok).most_common()[0:20])
df_wsj = pd.DataFrame(Counter(wsj_tok).most_common()[0:20])
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_combined.iloc[:, 1].values, width)  #.iloc replaces the deprecated .ix
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in All News Outlets")
ax.set_xticklabels(df_combined[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_l.iloc[:, 1].values, width, color = "#ffbf80")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in Liberal News Outlets")
ax.set_xticklabels(df_l[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_c.iloc[:, 1].values, width, color = "#b30077")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in Conservative News Outlets")
ax.set_xticklabels(df_c[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_o.iloc[:, 1].values, width, color = "#ff8080")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in Other News Outlets")
ax.set_xticklabels(df_o[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_huff.iloc[:, 1].values, width, color = "#993300")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in Huffington Post")
ax.set_xticklabels(df_huff[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_abc.iloc[:, 1].values, width)
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in ABC News")
ax.set_xticklabels(df_abc[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_econ.iloc[:, 1].values, width, color = "#ff8080")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in Economist")
ax.set_xticklabels(df_econ[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_fox.iloc[:, 1].values, width, color = "#7575a3")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in Fox News")
ax.set_xticklabels(df_fox[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_npr.iloc[:, 1].values, width, color = "#ff8080")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in NPR")
ax.set_xticklabels(df_npr[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_wsj.iloc[:, 1].values, width, color = "#99ceff")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in WSJ")
ax.set_xticklabels(df_wsj[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_wash.iloc[:, 1].values, width, color = "#2d862d")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in Washington Post")
ax.set_xticklabels(df_wash[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_bb.iloc[:, 1].values, width, color = "#a366ff")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in Breitbart")
ax.set_xticklabels(df_bb[0], rotation = 90)
plt.show()
ind = np.arange(20)
width = 0.8
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, df_nyt.iloc[:, 1].values, width, color = "#b3003b")
ax.set_ylabel('Frequency')
ax.set_xlabel('Word')
ax.set_xticks(ind+width)
ax.set_title("Frequency of Most Common Words in NYTimes")
ax.set_xticklabels(df_nyt[0], rotation = 90)
plt.show()
We first obtain two lists (polarity and subjectivity for each website).
fox_polar, fox_subj = polar_subj(fox)
abc_polar, abc_subj = polar_subj(abc)
wsj_polar, wsj_subj = polar_subj(wsj)
npr_polar, npr_subj = polar_subj(npr)
nyt_polar, nyt_subj = polar_subj(nytimes)
wash_polar, wash_subj = polar_subj(washington)
huff_polar, huff_subj = polar_subj(huff)
breit_polar, breit_subj = polar_subj(breitbart)
econ_polar, econ_subj = polar_subj(economist)
Plot the article title polarity values for each news outlet
polar_to_plot = [huff_polar, abc_polar, nyt_polar, wash_polar, fox_polar, breit_polar, npr_polar, wsj_polar, econ_polar]
fig = plt.figure(1, figsize=(10, 10))
ax = fig.add_subplot(111)
bp = ax.boxplot(polar_to_plot, showmeans=True)
pylab.ylim([-1.1,1.1])
ax.set_title("Boxplot of Article Title Polarity for News Outlets")
ax.set_ylabel('Polarity')
ax.set_xticklabels( ('Huffington Post', 'ABC News', 'NYTimes', 'Washington Post', 'Fox News', 'Breitbart', 'NPR', 'WSJ', 'Economist') )
plt.xticks(rotation=90)
plt.show()
From this boxplot we see that most of the article title polarities are slightly positive, with the exception of the Huffington Post and the Wall Street Journal, which skew a little more negative.
Plot the article title subjectivity values for each news outlet
subj_to_plot = [huff_subj, abc_subj, nyt_subj, wash_subj, fox_subj, breit_subj, npr_subj, wsj_subj, econ_subj]
fig = plt.figure(1, figsize=(9, 6))
ax = fig.add_subplot(111)
bp = ax.boxplot(subj_to_plot, showmeans=True)
pylab.ylim([-0.1,1.1])
ax.set_title("Boxplot of Article Title Subjectivity for News Outlets")
ax.set_ylabel('Subjectivity')
ax.set_xticklabels( ('Huffington Post', 'ABC News', 'NYTimes', 'Washington Post', 'Fox News', 'Breitbart', 'NPR', 'WSJ', 'Economist') )
plt.xticks(rotation=90)
plt.show()
Overall, all of the article titles are subjective. The Washington Post has the highest average subjectivity, whereas the Wall Street Journal has the lowest (and the narrowest range of subjectivity).
Plot the Polarity of Conservative, Liberal, and Other News Outlets
grouped_polar = [huff_polar+abc_polar+nyt_polar+wash_polar, fox_polar+breit_polar, npr_polar+wsj_polar+econ_polar]
fig = plt.figure(1, figsize=(9, 6))
ax = fig.add_subplot(111)
bp = ax.boxplot(grouped_polar, showmeans=True)
pylab.ylim([-1.1,1.1])
ax.set_title("Boxplot of Article Title Polarity Between News Outlets")
ax.set_ylabel('Polarity')
ax.set_xticklabels( ('Liberal', 'Conservative', 'Other') )
plt.xticks(rotation=90)
plt.show()
From this boxplot we see that grouping the articles by type of newspaper leads to an inconclusive result. All three groups have a median polarity at or near 0, with a wide spread of other polarities (from -1 to 1, with the exception of Other).
Plot the Subjectivity of Conservative, Liberal, and Other News Outlets
grouped_subj = [huff_subj+abc_subj+nyt_subj+wash_subj, fox_subj+breit_subj, npr_subj+wsj_subj+econ_subj]
fig = plt.figure(1, figsize=(9, 6))
ax = fig.add_subplot(111)
bp = ax.boxplot(grouped_subj, showmeans=True)
pylab.ylim([-0.1,1.1])
ax.set_title("Boxplot of Article Title Subjectivity Between News Outlets")
ax.set_ylabel('Subjectivity')
ax.set_xticklabels( ('Liberal', 'Conservative', 'Other') )
plt.xticks(rotation=90)
plt.show()
The results here are slightly clearer. The liberal and other newspapers have a higher subjectivity level than the conservative newspapers, but all three are subjective.
Make a dataframe of all of the titles
organize = pd.DataFrame(columns =['title','type'])
liberal = huff+nytimes+abc+washington
conservative = breitbart+fox
other = npr+economist+wsj
total_titles = liberal+conservative+other
Create a dataframe of all of the titles and the type of newspaper they came from.
for i in range(len(total_titles)):
    if i < len(liberal):
        organize.loc[i] = [total_titles[i], "Liberal"]
    elif i < len(liberal + conservative):
        organize.loc[i] = [total_titles[i], "Conservative"]
    else:
        organize.loc[i] = [total_titles[i], "Other"]
tfidf = TfidfVectorizer().fit_transform(total_titles)
pairwise_similarity = tfidf * tfidf.T
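Because TfidfVectorizer L2-normalizes each row by default, the product tfidf * tfidf.T is exactly the matrix of pairwise cosine similarities between titles. A minimal sketch with made-up titles:
#Hypothetical mini-demo: near-identical titles score close to 1.0, unrelated ones near 0.0
demo_tfidf = TfidfVectorizer().fit_transform(["trump signs order", "trump signs new order", "stocks rally today"])
print((demo_tfidf * demo_tfidf.T).toarray().round(2))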
Get all pairs of titles with a similarity of at least 0.75, excluding exact self-matches
x = []
y = []
for row in range(pairwise_similarity.shape[0]):
    for col in range(pairwise_similarity.shape[1]):
        if pairwise_similarity[row, col] >= 0.75:
            if round(pairwise_similarity[row, col], 4) != 1:
                x.append(row)
                y.append(col)
similar_titles = zip(x,y)
Get all unique pairs (dropping each mirrored duplicate)
unique = []
for item in similar_titles:
    if not (item in unique or tuple([item[1], item[0]]) in unique):
        unique.append(item)
Find the titles associated with each pair
sim_titles = []
for t in unique:
    sim_titles.append(zip(organize.loc[t[0]], organize.loc[t[1]]))
pd.set_option('display.width', 1000)
similar_title = pd.DataFrame(sim_titles, columns =['Similar Titles','Type'])
similar_title = pd.concat([similar_title.iloc[0:5],similar_title.iloc[8:14],similar_title.iloc[16:20],similar_title.iloc[24:25]])
similar_title = similar_title.set_index([range(len(similar_title))])
Dataframe of pairwise title comparisons
pd.options.display.max_colwidth = 300
similar_title
Comparing the most similar titles across liberal, conservative, and other news sources, the strongest similarities occur within groups rather than between them. Liberal sources share their most similar titles with other liberal sources, and conservative sources with other conservative sources; conservative sources never share a similar title with liberal ones. This suggests that the stories liberal and conservative sources cover differ substantially. As the table shows, both conservative and liberal sources do share similar titles with the "other" sources, which implies that the "other" outlets cover a mix of liberal and conservative news.
superfindf = pd.read_csv('analysis.csv')
x = range(-10,11)
y = [0]*21
y2 = range(-1,2)
x2 = [0]*3
superfindf['changedpose'] = (superfindf['pose']*2 - 1)  #rescale 'pose' from [0, 1] to [-1, 1]
fig, ax = plt.subplots()
ax.plot(x,y,c = 'red')
ax.plot(x2,y2,c = 'red')
ax.scatter(x = superfindf['pew score'], y = superfindf['changedpose'],s = superfindf['betsum']*85);
for i in range(9):
    ax.annotate(superfindf['news'][i], (superfindf['pew score'][i], superfindf['changedpose'][i]), size = 8)
ax.set_xlabel('Pew Ideology Score')
ax.set_ylabel('Positivity Toward Exec Branch')
ax.set_title('Trump\'s Perception in Media Vs Ideology')
plt.xlim(-10,10);
plt.ylim(-1,1);
plt.show()
superfindf['difference'] = superfindf['pose'] - superfindf['norms']
xline = [0,1]
yline = xline
fig, ax = plt.subplots()
ax.scatter(x = superfindf['norms'], y = superfindf['pose'],s = superfindf['betsum']*85);
for i in range(9):
    ax.annotate(superfindf['news'][i], (superfindf['norms'][i], superfindf['pose'][i]), size = 8)
ax.set_xlabel('Random Sample Sentiment')
ax.set_ylabel('Positivity of Exec Branch')
ax.set_title('Random Sample of Titles Sentiments Versus Trump')
ax.plot(xline, yline, c = 'black')  #y = x reference line
plt.xlim(0,1);
plt.ylim(0,1);
plt.show()
fig, ax = plt.subplots()
ax.plot(x,y,c = 'red')
ax.plot(x2,y2,c = 'red')
ax.scatter(x = superfindf['pew score'], y = superfindf['difference'],s = superfindf['betsum']*85);
for i in range(9):
    if superfindf['news'][i] != 'Economist':
        ax.annotate(superfindf['news'][i], (superfindf['pew score'][i], superfindf['difference'][i]), size = 8)
    else:
        ax.annotate(superfindf['news'][i], (superfindf['pew score'][i] - .1, superfindf['difference'][i] - .1), size = 8)
ax.set_xlabel('Pew Ideology Score')
ax.set_ylabel('Positivity of Trump - Avg Sentiment')
ax.set_title('Trump\'s Perception in Media Vs Ideology')
plt.xlim(-10,10);
plt.ylim(-1,1);
plt.show()
numTitles = pd.DataFrame([len(huff), len(abc), len(economist), len(fox),len(npr),len(nytimes),len(washington),len(wsj),len(breitbart)])
numTitles = numTitles.transpose()
numTitles.columns = ["huff", "abc", "economist", "fox", "npr", "nytimes", "washington", "wsj", "breitbart"]
numTitles
From here we see that we don't have the same number of titles for each news outlet, which can bias the results.
One thing we noticed when examining the titles is that some of them are sarcastic, which neither NLTK nor TextBlob can detect. As a result, a title is sometimes scored as positive overall when it is really negative.
Examples of negative sentiments:
#Find and display the first Breitbart title with negative polarity
for i in range(0, len(breitbart)):
    blob = TextBlob(breitbart[i])
    if blob.sentiment.polarity < 0:
        break
blob, blob.sentiment
blob = TextBlob(breitbart[9])
blob.sentiment
#For each news outlet, show the first title with negative polarity
for totalArticles in everything:
    for j in range(0, len(totalArticles)):
        blob = TextBlob(totalArticles[j])
        if blob.sentiment.polarity < 0:
            print(blob, blob.sentiment)
            break
def getTitles(news):
    """
    Finds the first title with negative polarity and the first with
    positive polarity in a list of titles
    Input: list of article titles
    Output: negative (list), positive (list), each holding a title and its polarity
    """
    negative = []
    positive = []
    for i in range(0, len(news)):
        blob = TextBlob(news[i])
        if blob.sentiment.polarity < 0 and negative == []:
            negative.append(blob)
            negative.append(blob.sentiment.polarity)
        if blob.sentiment.polarity > 0 and positive == []:
            positive.append(blob)
            positive.append(blob.sentiment.polarity)
        if positive != [] and negative != []:
            break
    return negative, positive
The variables below each hold the first title found with negative polarity and the first found with positive polarity for that outlet.
pHuff = getTitles(huff)
pABC = getTitles(abc)
pEco = getTitles(economist)
pfox = getTitles(fox)
pnpr = getTitles(npr)
pnyt = getTitles(nytimes)
pWash = getTitles(washington)
pwsj = getTitles(wsj)
pbreitbart = getTitles(breitbart)
TextBlob scored the title "Trump Suggests 'The Apprentice' Has Died A 'Sad' Death" as negative. However, reading it we can see that this title is more sarcastic than negative. A positive title is "Trump Has The Best Words, And He Even Invented A New One." This too is sarcastic; without reading the article itself, one would think the writer is poking fun at Trump.
pHuff