We will be scraping the Billboard top 100 charts and artist/band pages on Wikipedia to learn about popular artist rankings from 1970 to 2015.
%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd
import time
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
# Pandas / seaborn display configuration for notebook output.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
sns.set_style("whitegrid")
sns.set_context("poster")
import requests
from bs4 import BeautifulSoup
# Fetch the 1970 year-end Hot 100 page and pull out its chart table rows.
req = requests.get('http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1970')
page = req.text
soup = BeautifulSoup(page, 'html.parser')
# The chart is the first table on the page with class "wikitable".
tables_wikitable = soup.find_all('table', 'wikitable')
# All <tr> rows of that table, header row included (skipped later).
rows = [row for row in tables_wikitable[0].find_all('tr')]
def get_td(row):
    """Return every <td> cell of a chart-table row as a list."""
    return list(row.find_all('td'))
def get_rank(td):
    """Extract the chart-ranking text from the first cell of the row."""
    rank_cell = td[0]
    return rank_cell.string
def get_url(td):
    """Pull the artist-page href out of the third cell's anchor."""
    anchor = td[2].a
    return anchor['href']
def get_band_singer(td):
    """Return the band/singer name shown in the artist cell's link."""
    artist_link = td[2].a
    return artist_link.string
def get_title(td):
    """Return the song title from the title cell of a chart row.

    Prefers the anchor's ``title`` attribute when the cell is linked,
    and falls back to the cell's plain text otherwise.  The fallback
    also covers an anchor that carries no ``title`` attribute, which
    previously raised KeyError.
    """
    anchor = td[1].a
    if anchor:
        title = anchor.get('title')
        if title is not None:
            return title
    return td[1].string
# Formatting test: parse the 1970 table (skipping the header row) into dicts.
td_list = [get_td(row)for row in rows[1:]]
list_of_dicts = [{'url':get_url(td), 'ranking':get_rank(td),
                  'band_singer': get_band_singer(td), 'title':get_title(td)}
                 for td in td_list]
# Build the year-end Hot 100 page url for every chart year we analyze.
urls = [
    f'http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}'
    for year in range(1970, 2015)
]
def get_text(urls):
    """Download each chart page and key its HTML by year.

    Parameters
    ----------
    urls : list of Billboard year-end chart page urls whose final
        '_'-separated token is the year (e.g. ...singles_of_1970).

    Returns
    -------
    dict mapping year string (e.g. '1970') to the page's HTML text.
    """
    yearstext = {}
    for url in urls:
        req = requests.get(url)
        # The year is the final underscore-separated token of the url.
        yearstext[url.split('_')[-1]] = req.text
        # Be polite to wikipedia: pause between requests.
        time.sleep(1)
    return yearstext
# Download all 45 chart pages (slow: one-second sleep per request).
yearstext = get_text(urls)
def parse_year(the_year, yeartext_dict):
    """
    Parse one year-end Hot 100 wikipedia page into per-single records.

    Inputs
    ------
    the_year: the chart year to parse (used as a string key)
    yeartext_dict: a dictionary mapping year strings to the downloaded
        wikipedia page HTML for that year.

    Returns
    -------
    A list of dictionaries, one per single, each formatted as:
        band_singer: a list of bands/singers who made this single
        ranking: ranking of the single
        song: a list of the titles of songs on this single
        songurl: a list of the same size as song with urls for the songs
        titletext: the contents of the title table cell
        url: a list of wikipedia singer/band urls on this single (only
            the part of the url from /wiki onwards)
    """
    page_html = yeartext_dict[str(the_year)]
    page_soup = BeautifulSoup(page_html, 'html.parser')
    chart_table = page_soup.find_all('table', 'wikitable')[0]
    # Skip the header row; every remaining <tr> describes one single.
    data_rows = chart_table.find_all('tr')[1:]
    return [get_single_dict(data_row) for data_row in data_rows]
def get_single_dict(row):
    """
    Parse one chart-table row into a dictionary describing a single.

    input: a table-row tag from the chart table
    output: dict with keys band_singer, ranking, song, songurl,
        titletext, url (lists for the multi-valued fields)
    """
    # Drop the newline text nodes bs4 keeps between cells.
    cells = [child for child in row.children if child != '\n']
    ranking = cells[0].string
    artist_links = cells[2].find_all('a')
    band_singer = [link.string for link in artist_links]
    url = [link['href'] for link in artist_links]
    song_links = cells[1].find_all('a')
    songurl = [link['href'] for link in song_links]
    # Edge case: row has no linked song page at all.
    if not songurl:
        songurl = [None]
    song = [link.string for link in song_links]
    if not song:
        # Unlinked title: fall back to the cell's raw text.
        song = cells[1].string
    if isinstance(song, list):
        title = '/'.join(str(s) for s in song)
    else:
        title = song
    single_dict = {'band_singer': band_singer, 'ranking': ranking, 'song': song,
                   'songurl': songurl, 'titletext': title, 'url': url}
    return single_dict
#test to make sure parse_year runs correctly
#yearinfo = parse_year(1997, yearstext)
import json
# Dump each year's Billboard Hot 100 (1970-2014) to a JSON file, read
# each file back, and accumulate everything into one flat DataFrame.
year_frames = []
for year in range(1970, 2015):
    yearinfo = parse_year(year, yearstext)
    # with-statements guarantee the file handles are closed even on error
    # (the original used open()/close() without a try/finally).
    with open("data/{0}info.json".format(str(year)), "w") as fd:
        json.dump(yearinfo, fd)
    with open("data/{0}info.json".format(str(year)), "r") as f:
        curyearinfo = json.load(f)
    year_df = pd.DataFrame(curyearinfo)
    year_df['year'] = year
    year_frames.append(year_df)
# Single concat instead of repeated DataFrame.append (deprecated in
# modern pandas); preserves each year's 0..99 index just like append did.
flatframe = pd.concat(year_frames)
flatframe.reset_index(inplace = True)
#clean dataframe, create new rows for singles with multiple band_singers
cols = ['band_singer', 'song', 'songurl', 'url']
for col in cols:
    # Unwrap single-element lists so single-valued cells become scalars.
    flatframe[col] = flatframe[col].apply(lambda x: x[0] if type(x) == list and len(x)==1 else x)
# Explode the list-valued band_singer/url columns into one row per artist.
# stack() keeps the original row index (level 0) so the exploded rows can
# be joined back to the rest of the frame below.
bands_expanded = flatframe.apply(lambda x: pd.Series(x['band_singer']),axis=1).stack().reset_index(level=1, drop=True)
urls_expanded = flatframe.apply(lambda x: pd.Series(x['url']),axis=1).stack().reset_index(level=1, drop=True)
df_bandrows = pd.concat([bands_expanded, urls_expanded], axis = 1)
df_bandrows.columns = ['band_singer', 'url']
# Replace the original list columns with the exploded per-artist rows.
flatframe = flatframe.drop(df_bandrows.columns, axis=1).join(df_bandrows)
#clean rankings, strip new line
flatframe.ranking = flatframe.ranking.str.strip('\n')
#create new row for singles with multiple songs
# Split the frame: rows whose song field is still a list need exploding.
newframe =flatframe[flatframe.song.apply(lambda x: type(x) == list)]
flatframe_dropped = flatframe[flatframe.song.apply(lambda x: type(x) != list)]
# Same stack() trick as for band_singer/url above: one row per song,
# indexed by the original row so it can be joined back.
songs_expanded = newframe.apply(lambda x: pd.Series(x['song']),axis=1).stack().reset_index(level=1, drop=True)
songurls_expanded = newframe.apply(lambda x: pd.Series(x['songurl']),axis=1).stack().reset_index(level=1, drop=True)
df_expanded = pd.concat([songs_expanded, songurls_expanded], axis = 1)
df_expanded.columns = ['song', 'songurl']
# inner join keeps only the multi-song rows being replaced here.
flatframe = flatframe.drop(df_expanded.columns, axis = 1).join(df_expanded, how = 'inner')
# Recombine single-song rows with the newly exploded multi-song rows.
flatframe = pd.concat([flatframe_dropped, flatframe], sort = True)
flatframe = flatframe[['ranking', 'band_singer','song', 'songurl',
                       'titletext', 'url', 'year']]
# Drop rows that never had a ranking (artifacts of the joins above).
flatframe = flatframe[~flatframe.ranking.isna()]
#handle case of ranking ties
# BUG FIX: the original wrote this result into a brand-new column named 0
# (flatframe.loc[:, 0]) instead of back into 'ranking', so the 'Tie'
# strings survived and the astype(int) below could not succeed.
flatframe.loc[:, 'ranking'] = flatframe.ranking.apply(lambda x: '0' if x == 'Tie' else x)
flatframe = flatframe.astype({'ranking': int, 'url': str})
# Rows marked 'Tie' (now ranking 0) get their true chart positions,
# looked up manually from the Billboard charts.
tied_rankings = flatframe[flatframe.ranking == 0].index
tied_values = [38, 55, 86, 92, 92]
for i, tie in enumerate(tied_rankings):
    flatframe.loc[tie, 'ranking'] = tied_values[i]
# Reload cached year info from disk.
# NOTE(review): data/yearinfo.json is never written in this notebook
# (only data/{year}info.json files are); presumably produced by an
# earlier run -- confirm the file exists before running this cell.
with open("data/yearinfo.json", "r") as fd:
    yearinfo = json.load(fd)
# Count Hot 100 appearances per artist and plot those with > 15 entries.
quality_counts = flatframe.band_singer.value_counts()
quality_bands = quality_counts[quality_counts.values>15]
f = plt.figure(figsize = (10,6), frameon = False)
sns.barplot(quality_bands.index, quality_bands.values, color = '#528ff2')
plt.xticks(rotation=90)
plt.title("Number of Billboard Top 100 Occurrences by Artist ");
Here we perform a sanity check to make sure our results are accurate. All of these artists/bands were extremely successful, as we would expect. However, we might be surprised to see The Black Eyed Peas on this list. Even though The Black Eyed Peas were a successful group, we are not sure that they would have reached the Billboard Hot 100 over 15 times. But a quick search of the <a href = https://www.billboard.com/music/the-black-eyed-peas> Billboard Hot 100 </a> shows that The Black Eyed Peas did, in fact, reach the Billboard Hot 100 16 times. We still aren't completely sure that they would be on the year-end Hot 100 list, but more research on the Billboard Hot 100 demonstrates that they were.
We are similarly suspicious seeing that Michael Jackson and Janet Jackson ended up right next to each other on our histogram. Again using the Billboard Hot 100 website we find that this checks out.
We should also note that we are counting songs where artists were featured, so this impacts our results. We argue that being featured in a song is part of an artist's overall success on the Billboard Top 100; however, it does impact our ability to compare ranking information, and should be taken into account in the following analysis.
We might argue that a singer should be scored higher if the singer appears higher in the rankings. The number of times on the Top 100 Billboard chart alone would not be enough to capture the differences in rankings on that chart.
To do this, we group all of a singer's songs together and assign each song a score of 101 − ranking.
# Score each chart entry so that rank 1 earns 100 points and rank 100 earns 1.
flatframe['quality_score'] = 101 - flatframe['ranking']
# Total score per artist across all of their chart entries.
quality_ranking = flatframe.groupby('band_singer').quality_score.sum().sort_values(ascending = False)
top_35 = quality_ranking.head(35)
f = plt.figure(figsize = (10,6), frameon = False)
sns.barplot(top_35.index, top_35.values, color = '#528ff2')
plt.xticks(rotation=90)
plt.xlabel('Artist')
plt.ylabel('Score: 101 - Ranking');
Now we see that Mariah Carey takes the number one spot and our histogram has changed quite a bit! Janet Jackson now ranks higher than Michael Jackson when we account for placement on the chart, which I was not expecting to see.
We could go further to account for songs where an artist is featured. We could do this by assigning weights that account for the artist being labeled as featured counting less toward their overall ranking. The choice of weights would be subjective, but the rankings would be more comparable. While our analysis does not take this into account, we should note this limitation in our findings.
Who just made it to the histogram that wasn't there before?
# Artists in the score-weighted top 35 that were absent from the raw-count list.
[band for band in top_35.index if band not in quality_bands.index]
Who dropped off our list?
# Artists on the raw-count list that fell out of the score-weighted top 35.
[band for band in quality_bands.index if band not in top_35.index]
This is just an arbitrary cutoff, but it gives a sense of how the 'most successful' artists can change when you adjust for factors like ranking.
# Cache of fetched artist pages: url -> page text, or the sentinel
# 1 (non-200 HTTP status) or 2 (request raised an exception).
urlcache={}
def get_page(url):
    """Fetch a wikipedia page by path (e.g. '/wiki/X'), with caching.

    Successful fetches cache the page text.  Failures cache the sentinel
    1 (bad status) or 2 (exception), which the guard below treats as
    "not yet fetched" so they are retried on the next call.
    """
    # Fetch if never seen, or if a previous attempt failed (sentinel 1/2).
    if (url not in urlcache) or (urlcache[url]==1) or (urlcache[url]==2):
        time.sleep(1)  # rate-limit requests to wikipedia
        try:
            r = requests.get("http://en.wikipedia.org%s" % url)
            if r.status_code == 200:
                urlcache[url] = r.text
            else:
                urlcache[url] = 1
        # Narrowed from a bare except: (which would also swallow
        # KeyboardInterrupt/SystemExit); kept broad on purpose so any
        # request failure degrades to a retryable sentinel.
        except Exception:
            urlcache[url] = 2
    return urlcache[url]
#sort by year
flatframe=flatframe.sort_values('year')
flatframe.head()
# Populate the cache -- commented out because it is slow; run once.
#s = flatframe["url"].apply(get_page)
# Sentinels 1 and 2 mark failed fetches (see get_page).
print("Number of bad requests:",
      np.sum([(urlcache[k]==1) or (urlcache[k]==2) for k in urlcache]))
print("Did we get all urls?", len(flatframe.url.unique())==len(urlcache))
#run once to do json dump (kept as a string so it cannot run by accident)
"""with open("data/artistinfo.json","w") as fd:
json.dump(urlcache, fd)
del urlcache""";
# Reload the cached artist pages from disk.
with open("data/artistinfo.json") as json_file:
    urlcache = json.load(json_file)
import bs4
def singer_band_info(url, page_text):
    """Extract birth date and years-active from an artist's wikipedia page.

    Inputs
    ------
    url: the url
    page_text: the text associated with the url

    Returns
    -------
    A dictionary:
        url: input argument url
        born: the artist's birthday (string), or False if not found
        ya: years active (list of strings), or False if not found

    NOTE(review): the elif below means a page listing BOTH "Born" and
    "Years active" only ever yields `born`.  Confirm this is intended --
    downstream code splits singers (born != False) from bands
    (ya != False), so such pages are always treated as singers.
    """
    born = False
    ya = False
    soup = BeautifulSoup(page_text,'html.parser')
    # All infobox header labels (<th>) on the page.
    table_strings = [th.string for th in soup.find_all('th')]
    #get and clean year born data
    if "Born" in table_strings:
        # The <td> paired with the "Born" header row.
        year_born_td = [tr.td for tr in soup.find_all('tr') if
                        tr.th and tr.th.string == 'Born'][0]
        for child in year_born_td.children:
            # child may be a NavigableString, whose .find is str.find and
            # returns an int -- the Tag type-checks below guard that case.
            span = child.find('span')
            if type(span) == bs4.element.Tag and span.span:
                born = span.span.string
            elif type(span) == bs4.element.Tag and span['class'] == ['bday']:
                born = span.string
    #get and clean year active data
    elif "Years active" in table_strings:
        # The <td> paired with the "Years active" header row.
        years_active_td = [tr.td for tr in soup.find_all('tr') if
                           tr.th and tr.th.string == 'Years active'][0]
        if years_active_td.string:
            # Single text node, e.g. "1990–2000": split into [start, end].
            ya = years_active_td.string.split("–")
        elif not years_active_td.find_all('li'):
            # Mixed content with no list: keep only the bare text fragments.
            ya = [date for date in years_active_td.contents
                  if type(date) == bs4.element.NavigableString]
        else:
            # Bulleted list of active periods: collect text from each <li>.
            ya = []
            for li in years_active_td.find_all('li'):
                ya += [date for date in li.contents
                       if type(date) == bs4.element.NavigableString]
        if len(ya) == 1:
            # A lone "start–end" fragment: split it into [start, end].
            ya = ya[0].split('–')
    return {"url": url, "born": born,"ya": ya}
#create dataframe from dictionary
band_singer_info_df = pd.DataFrame(columns =['url', 'born', 'ya'])
for key, value in urlcache.items():
    # Integer values are the 1/2 failure sentinels from get_page; skip them.
    if type(value) != int:
        band_singer_info_df = band_singer_info_df.append(singer_band_info(key, value), ignore_index = True);
#merge band info dataframe with ranking dataframe
# outer merge keeps chart rows even when no artist page was fetched.
band_info_ranking_df = flatframe.merge(band_singer_info_df, on = 'url', how = "outer")
from dateutil.parser import *
from dateutil.parser import *
from dateutil.relativedelta import relativedelta
def date_parse(date):
    """Parse a scraped birth-date string into a datetime, else None.

    Accepts any non-empty string.  bs4.NavigableString is a str
    subclass, so this is a backward-compatible generalization of the
    original NavigableString-only type check; NaN floats and None from
    the outer merge still fall through to None.
    """
    if date and isinstance(date, str):
        return parse(str(date))
    return None
# Singers: rows where a birth date was found on the artist's page.
singers = band_info_ranking_df[band_info_ranking_df.born !=False]
# One row per singer (first occurrence after the earlier year sort).
singers = singers.drop_duplicates(subset = 'band_singer')
singers.year = singers.year.astype(int)
singers.born = singers.born.apply(date_parse)
# Re-parse the chart year as a full datetime so the subtraction works.
singers.year = singers.year.astype(str).apply(parse)
singers['age_ranked_days'] = singers.year-singers.born
#convert nanoseconds to years (1 ns ~= 3.171e-17 years)
singers.age_ranked_days = singers.age_ranked_days.astype(int)*3.171e-17
# Drop impossible (non-positive) ages produced by bad date parses.
singers = singers[singers.age_ranked_days >0]
plt.figure(figsize=(10,6))
plt.hist(singers.age_ranked_days.values, bins = 20)
plt.title("Age Top Hit was Achieved")
plt.xlabel("Age")
plt.show()
The age at which a singer's top hit was achieved takes on a roughly normal shape. We do see fewer young singers achieving top hits than older singers; however, the negative skew is not as prominent as one might expect considering the advantage of experience. We might be seeing this because singing ability declines with age, or because it is difficult to create extremely popular, relevant music at older ages.
# Bands: rows where a years-active value was found on the artist's page.
bands = band_info_ranking_df[band_info_ranking_df.ya != False]
def get_year_inception(years):
    """Normalize a scraped 'years active' value to its inception year.

    Accepts a float (NaN from the merge -- passed through untouched),
    a string/bs4.NavigableString such as "1990–2000" or "1960s", or a
    list of such fragments (only the first fragment is used).  Returns
    the leading year as a string (or the float input unchanged).
    """
    if isinstance(years, float):
        # NaN from the outer merge: pass through; zeroed out by the caller.
        return years
    # NavigableString subclasses str, so one isinstance covers both.
    if isinstance(years, str):
        if years.isdigit():
            # Already a bare year -- the split chain would be a no-op.
            return years
        # Strip range suffixes: minus-sign, en-dash, and decade 's'.
        return years.split('−')[0].split('s')[0].split('–')[0]
    # Otherwise a list of fragments: the first holds the inception year.
    first = years[0]
    if "–" in first:
        return first.split("–")[0]
    if "-" in first:
        return first.split('-')[0]
    return first.split('−')[0].split('s')[0].split('–')[0]
bands = bands[bands.ya != '' ]
#handles edge cases for years active format
band_year_inception = bands.ya.apply(get_year_inception)
# Zero out values the parser cannot handle (odd formats, NaN, empty).
band_year_inception = band_year_inception.apply(lambda x: x if x != '1997—1999, 2014' else 0)
band_year_inception = band_year_inception.apply(lambda x: x if x != 'Early 1960s' else 0)
band_year_inception = band_year_inception.apply(lambda x: x if type(x)!= float else 0)
band_year_inception = band_year_inception.apply(lambda x: x if x != '' else 0)
# NOTE(review): this stores a whole Series under the label
# 'since_inception' inside band_year_inception (itself a Series, not a
# DataFrame).  It works via attribute access below, but a DataFrame
# column would be the conventional way to do this -- verify intent.
band_year_inception["since_inception"] = bands.year - band_year_inception.values.astype(int)
band_incep_sorted = band_year_inception.since_inception.sort_values(ascending = False)
# Drop absurd spans produced by inception years that were zeroed to 0.
band_incep_sorted = band_incep_sorted[band_incep_sorted <1900]
plt.figure(figsize=(10,6))
plt.hist(band_incep_sorted, bins = 20)
plt.xticks(np.arange(0,50, 5))
plt.title("Years Since Inception at Which Bands Reach Their Top Ranking")
plt.xlabel("Year")
plt.show()
More bands achieve their top hit at year 5 than at any other year. The majority of bands achieve their top hit at or before year six.