Scraping Billboard Top 100 on Wikipedia

Who are the top 100?

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import pandas as pd
import time
import seaborn as sns

import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt


pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
sns.set_style("whitegrid")
sns.set_context("poster")

Scraping the Billboard Top 100

In [3]:
import requests 
from bs4 import BeautifulSoup

req = requests.get('http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1970')
page = req.text
soup = BeautifulSoup(page, 'html.parser')
tables_wikitable = soup.find_all('table', 'wikitable')
rows = [row for row in tables_wikitable[0].find_all('tr')]


def get_td(row):
    return [td for td in row.find_all('td')]

def get_rank(td):
    return td[0].string

def get_url(td):
    return td[2].a['href']

def get_band_singer(td):
    return td[2].a.string

def get_title(td):
    if td[1].a:
        return td[1].a['title']
    else:
        return td[1].string
    
    
#formating test
td_list = [get_td(row)for row in rows[1:]]
list_of_dicts = [{'url':get_url(td), 'ranking':get_rank(td), 
                'band_singer': get_band_singer(td), 'title':get_title(td)} 
                for td in td_list]
In [4]:
#generate list of urls 
urls = ['http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{0}'.format(str(i)) for i in range(1970, 2015)]

def get_text(urls):
    """"""
    yearstext = {}
    for url in urls:
        req = requests.get(url)
        yearstext[url.split('_')[-1]] = req.text
        time.sleep(1)
    return yearstext

yearstext = get_text(urls)
In [5]:
def parse_year(the_year, yeartext_dict):
    """
    Inputs
    ------
    the_year: 
    yeartext_dict: a dictionary with keys as integer years and values the downloaded web pages 
    from wikipedia for that year.
   
    Returns
    -------

    a list of dictionaries corresponding to a single, with dictionaries formated:

        band_singer: a list of bands/singers who made this single
        song: a list of the titles of songs on this single
        songurl: a list of the same size as song which has urls for the songs on the single 
            (see point 3 above)
        ranking: ranking of the single
        titletext: the contents of the table cell
        band_singer: a list of bands or singers on this single
        url: a list of wikipedia singer/band urls on this single: only put in the part 
        of the url from /wiki onwards
    
"""

    soup = BeautifulSoup(yeartext_dict[str(the_year)], 'html.parser')
    tables_wikitable = soup.find_all('table', 'wikitable')
    rows = [row for row in tables_wikitable[0].find_all('tr')][1:]
    yearinfo = [get_single_dict(row) for row in rows]
    return yearinfo



def get_single_dict(row):
    """
    input: list of contents in single row
    output: dictionary of content from single row
    """
    
    children = [child for child in row.children]
    children = list(filter(lambda x: x != '\n', children))
    ranking = children[0].string
    band_singers = children[2].find_all('a')
    band_singer = [band.string for band in band_singers]
    url = [url['href'] for url in band_singers]
    songs = children[1].find_all('a')
    songurl = [song['href'] for song in songs]
    
    #handles edge cases where row content is missing
    if songurl == []:
        songurl = [None]
    song = [song.string for song in songs]
    if not song:
        song = children[1].string
        
    if type(song) == list:
        title = '/'.join(str(s) for s in song)
    else:
        title = song

    single_dict = {'band_singer': band_singer, 'ranking': ranking, 'song': song,
                   'songurl':songurl, 'titletext':title, 'url': url}
    return single_dict
In [6]:
#test to make sure parse_year runs correctly

#yearinfo = parse_year(1997, yearstext)

Save a json file of information from the scraped files

In [7]:
import json
In [8]:
#create json file for each year's Billboard Hot 100's: 1970-2014
for year in range(1970, 2015):
    yearinfo = parse_year(year, yearstext)
    fd = open("data/{0}info.json".format(str(year)),"w")
    json.dump(yearinfo, fd)
    fd.close()
  
    #create DataFrame for Billboard Hot 100's: 1970-2014
    with open("data/{0}info.json".format(str(year)), "r") as f:
        curyearinfo = json.load(f)
       
        if year == 1970:
            
            flatframe = pd.DataFrame(curyearinfo)
            flatframe['year'] = year
        else:
            year_df = pd.DataFrame(curyearinfo)
            year_df['year'] = year
            flatframe = flatframe.append(year_df)

    
In [9]:
flatframe.reset_index(inplace = True)
In [10]:
#clean dataframe, create new rows for singles with multiple band_singers

cols = ['band_singer', 'song', 'songurl', 'url']
for col in cols:
    flatframe[col] = flatframe[col].apply(lambda x: x[0] if type(x) == list and len(x)==1 else x) 
    
bands_expanded = flatframe.apply(lambda x: pd.Series(x['band_singer']),axis=1).stack().reset_index(level=1, drop=True)
urls_expanded = flatframe.apply(lambda x: pd.Series(x['url']),axis=1).stack().reset_index(level=1, drop=True)
df_bandrows = pd.concat([bands_expanded, urls_expanded], axis = 1)
df_bandrows.columns = ['band_singer', 'url']

flatframe = flatframe.drop(df_bandrows.columns, axis=1).join(df_bandrows)
#clean rankings, strip new line
flatframe.ranking = flatframe.ranking.str.strip('\n')
In [11]:
#create new row for singles with multiple songs 

newframe =flatframe[flatframe.song.apply(lambda x: type(x) == list)]
flatframe_dropped = flatframe[flatframe.song.apply(lambda x: type(x) != list)]

songs_expanded = newframe.apply(lambda x: pd.Series(x['song']),axis=1).stack().reset_index(level=1, drop=True)
songurls_expanded = newframe.apply(lambda x: pd.Series(x['songurl']),axis=1).stack().reset_index(level=1, drop=True)


df_expanded = pd.concat([songs_expanded, songurls_expanded], axis = 1)
df_expanded.columns = ['song', 'songurl']

flatframe = flatframe.drop(df_expanded.columns, axis = 1).join(df_expanded, how = 'inner')
flatframe = pd.concat([flatframe_dropped, flatframe], sort = True)
flatframe = flatframe[['ranking', 'band_singer','song', 'songurl', 
                         'titletext', 'url', 'year']]
In [12]:
flatframe = flatframe[~flatframe.ranking.isna()]
In [13]:
#handle case of ranking ties
flatframe.loc[:,0] = flatframe.ranking.apply(lambda x: '0' if x == 'Tie' else x)

flatframe = flatframe.astype({'ranking': int, 'url': str})
tied_rankings = flatframe[flatframe.ranking == 0].index
tied_values = [38, 55,86, 92, 92]
for i, tie in enumerate(tied_rankings):
    flatframe.loc[tie, 'ranking' ] = tied_values[i]
    
In [14]:
with open("data/yearinfo.json", "r") as fd:
    yearinfo = json.load(fd)

Who made it to the Top 100 the most?

In [17]:
quality_counts = flatframe.band_singer.value_counts() 

quality_bands = quality_counts[quality_counts.values>15]

f = plt.figure(figsize = (10,6), frameon = False)
sns.barplot(quality_bands.index, quality_bands.values, color = '#528ff2')
plt.xticks(rotation=90)
plt.title("Number of Billboard Top 100 Occurrences by Artist ");

Checking our results

Here we perform a sanity check to make sure our results are accurate. All of these artists/bands were extremely successful as we would expect. However, we might be surprised to see The Black Eyed Peas on this list. Even though The Black Eyed Peas were a successful group, we are not sure that they would have been reached the Billboard Hot 100 over 15 times. But a quick search of the <a href = https://www.billboard.com/music/the-black-eyed-peas> Billboard Hot 100 </a> shows that the Black Eyed Peas did, in fact, reach the Billboard Hot 100 16 times. We still aren't completely sure that they would be on the year-end Hot 100 list, but more research on the Billboard Hot 100 demonstrates that they were.

We are similarly suspicious seeing that Michael Jackson and Janet Jackson ended up right next to each other on our histogram. Again using the Billboard Hot 100 website we find that this checks out.

We should also note that we are counting songs where artists were featured, so this impacts our results. We argue that being featured in a song is part of an artist's overall success on the Billboard Top 100, however, it does impact our ability to compare ranking information, and should be taken account in the following analysis.

Capturing Ranking Information

We might argue that a singer should to be scored higher if the singer appears higher in the rankings. Number of times on the Top 100 Billboard chart would not be enough to capture the differences in rankings on that chart.

To do this, group all of a singers songs together and assign each song a score 101 - ranking.

In [18]:
flatframe['quality_score'] = 101 - flatframe['ranking']

quality_ranking = flatframe.groupby('band_singer').quality_score.sum().sort_values(ascending = False)
top_35 = quality_ranking.head(35)

f = plt.figure(figsize = (10,6), frameon = False)
sns.barplot(top_35.index, top_35.values, color = '#528ff2')
plt.xticks(rotation=90)
plt.xlabel('Artist')
plt.ylabel('Score: 101 - Ranking');

Now we see that Mariah Carey takes the number one spot and our histogram has changed quite a bit! Janet Jackson now ranks higher than Michael Jackson when we account for placement on the chart, which I was not expecting to see.

We could go further to account for songs where an artist is featured. We could do this by assigning weights that account for the artist being labeled as featured counting less toward their overall ranking. The choice of weights would be subjective, but the rankings would be more comparable. While our analysis does not take this into account, we should note this limitation in our findings.

Who just made it to the histogram that wasn't there before?

In [19]:
[band for band in top_35.index if band not in quality_bands.index]
Out[19]:
['50 Cent',
 'Bruno Mars',
 'Maroon 5',
 'Boyz II Men',
 'Christina Aguilera',
 'Phil Collins']

Who dropped off our list?

In [20]:
[band for band in quality_bands.index if band not in top_35.index]
Out[20]:
['Britney Spears', 'Kelly Clarkson', 'Taylor Swift', 'Chicago', 'Ne-Yo']

This is just an arbitrary cut off, but give a sense of how the 'most successful' artists can change when you adjust for factors like ranking.

Scraping the band pages

In [21]:
urlcache={}
In [22]:
def get_page(url):
    # Check if URL has already been visited.
    if (url not in urlcache) or (urlcache[url]==1) or (urlcache[url]==2):
        time.sleep(1)
        try:
            r = requests.get("http://en.wikipedia.org%s" % url)

            if r.status_code == 200:
                urlcache[url] = r.text
            else:
                urlcache[url] = 1
        except:
            urlcache[url] = 2
    return urlcache[url]
In [23]:
#sort by year
flatframe=flatframe.sort_values('year')
flatframe.head()
Out[23]:
ranking band_singer song songurl titletext url year 0 quality_score
0 1 Simon & Garfunkel Bridge over Troubled Water /wiki/Bridge_over_Troubled_Water_(song) Bridge over Troubled Water /wiki/Simon_%26_Garfunkel 1970 1 100
72 73 Creedence Clearwater Revival Up Around the Bend /wiki/Up_Around_the_Bend Up Around the Bend /wiki/Creedence_Clearwater_Revival 1970 73 28
71 72 Elvis Presley The Wonder of You /wiki/The_Wonder_of_You The Wonder of You /wiki/Elvis_Presley 1970 72 29
70 71 The Delfonics Didn't I (Blow Your Mind This Time) /wiki/Didn%27t_I_(Blow_Your_Mind_This_Time) Didn't I (Blow Your Mind This Time) /wiki/The_Delfonics 1970 71 30
69 70 The Guess Who No Time /wiki/No_Time_(The_Guess_Who_song) No Time /wiki/The_Guess_Who 1970 70 31

Pulling and saving the data

In [24]:
#s = flatframe["url"].apply(get_page)
In [25]:
print("Number of bad requests:",
      np.sum([(urlcache[k]==1) or (urlcache[k]==2) for k in urlcache]))
print("Did we get all urls?", len(flatframe.url.unique())==len(urlcache)) 
Number of bad requests: 0.0
Did we get all urls? False
In [26]:
#run once to do json dump
"""with open("data/artistinfo.json","w") as fd:
    json.dump(urlcache, fd)
del urlcache""";
In [27]:
with open("data/artistinfo.json") as json_file:
    urlcache = json.load(json_file)
In [28]:
import bs4 

def singer_band_info(url, page_text):
    """Inputs
    ------
    url: the url
    page_text: the text associated with the url
   
    Returns
    -------
    A dictionary:
    url: input argument url 
    born: the artist's birthday
    ya: years active variable"""
    
    born = False
    ya = False
    soup = BeautifulSoup(page_text,'html.parser') 
    table_strings = [th.string for th in soup.find_all('th')]
    
    #get and clean year born data
    if "Born" in table_strings:
        year_born_td = [tr.td for tr in soup.find_all('tr') if
                        tr.th and tr.th.string == 'Born'][0]
        for child in year_born_td.children:
            span = child.find('span')
         
            if type(span) == bs4.element.Tag and span.span:
                born = span.span.string
           
            elif type(span) == bs4.element.Tag and span['class'] == ['bday']:
                born = span.string  
                
    #get and clean year active data   
    elif "Years active" in table_strings:
        years_active_td = [tr.td for tr in soup.find_all('tr') if 
                        tr.th and tr.th.string == 'Years active'][0] 
        
        if years_active_td.string:
            ya = years_active_td.string.split("–")
       
        elif not years_active_td.find_all('li'):
            ya = [date for date in years_active_td.contents 
                  if type(date) == bs4.element.NavigableString]
     
        else:
            ya = []
            for li in years_active_td.find_all('li'):
                ya += [date for date in li.contents 
                       if type(date) == bs4.element.NavigableString]
                if len(ya) == 1:
                    ya = ya[0].split('–')
    
    return {"url": url, "born": born,"ya": ya}
In [29]:
#create dataframe from dictionary
band_singer_info_df = pd.DataFrame(columns =['url', 'born', 'ya'])

for key, value in urlcache.items():
    if type(value) != int:
        band_singer_info_df = band_singer_info_df.append(singer_band_info(key, value), ignore_index = True);
In [30]:
#merge band info dataframe with ranking dataframe
band_info_ranking_df = flatframe.merge(band_singer_info_df, on = 'url', how = "outer")
from dateutil.parser import *
In [31]:
from dateutil.parser import *
from dateutil.relativedelta import relativedelta

def date_parse(date):
    if date and type(date) == bs4.NavigableString:
        return parse(str(date))
    else:
        return None
    
singers = band_info_ranking_df[band_info_ranking_df.born !=False]
singers = singers.drop_duplicates(subset = 'band_singer')
singers.year = singers.year.astype(int)

singers.born = singers.born.apply(date_parse)
singers.year = singers.year.astype(str).apply(parse)
In [32]:
singers['age_ranked_days'] = singers.year-singers.born

#convert nanoseconds to years
singers.age_ranked_days = singers.age_ranked_days.astype(int)*3.171e-17


singers = singers[singers.age_ranked_days >0]

What is the age at which singers achieve their top ranking?

In [33]:
plt.figure(figsize=(10,6))
plt.hist(singers.age_ranked_days.values, bins = 20)
plt.title("Age Top Hit was Achieved")
plt.xlabel("Age")
plt.show()

The age a singer's top hit was achieved takes on a roughly normal shape. We do see less young singer achieving top hits than older singers, however, the negative skew is not as prominent as one might expect considering the advantage of experience. We might be seeing this because singing ability declines with age, or because it is difficult to create extremely popular, relevant music at older ages.

In [34]:
bands = band_info_ranking_df[band_info_ranking_df.ya != False]
In [35]:
def get_year_inception(years):
    """clean years active to standard format"""
    if type(years) == float:
        return years
    if type(years) == str or type(years) == bs4.element.NavigableString:
    
        if years.isdigit():
            return years.split('−')[0].split('–')[0]
        return years.split('−')[0].split('s')[0].split('–')[0]
    if "–" in years[0]:
        return years[0].split("–")[0]
    if "-" in years[0]:
        return years[0].split('-')[0]
    else:
        return years[0].split('−')[0].split('s')[0].split('–')[0]
In [36]:
bands = bands[bands.ya != '' ]

#handles edge cases for years active format
band_year_inception = bands.ya.apply(get_year_inception)
band_year_inception = band_year_inception.apply(lambda x: x if x != '1997—1999, 2014' else 0)
band_year_inception = band_year_inception.apply(lambda x: x if x != 'Early 1960s' else 0)
band_year_inception = band_year_inception.apply(lambda x: x if type(x)!= float else 0)
band_year_inception = band_year_inception.apply(lambda x: x if x != '' else 0)


band_year_inception["since_inception"] = bands.year - band_year_inception.values.astype(int)
band_incep_sorted = band_year_inception.since_inception.sort_values(ascending = False)
band_incep_sorted = band_incep_sorted[band_incep_sorted <1900]

At what year since inception do bands reach their top rankings?

In [37]:
plt.figure(figsize=(10,6))
plt.hist(band_incep_sorted, bins = 20)
plt.xticks(np.arange(0,50, 5))
plt.title("Years Since Inception at Which Bands Reach Their Top Ranking")
plt.xlabel("Year")
plt.show()

More bands achieve their top hit at year 5 than at any other year. The majority of bands achieve their top hit at or before year six.