University towns are defined by their student populations. Are university towns less prone to the typical effects of a recession because of their reliable student populations? Our hypothesis is that average housing prices in college towns are less affected by recessions. For each town we will compute the ratio of its mean house price in the quarter before the 2008 recession began to its mean house price at the recession bottom, and we will run a t-test comparing this ratio between university towns and non-university towns. Our null hypothesis is that there is no difference between the means, and we will use a significance level of alpha = .01.
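To make the test concrete, here is a minimal sketch of the comparison, using placeholder price ratios rather than the real Zillow values; the full pipeline below computes these ratios from the actual data.
from scipy.stats import ttest_ind
# hypothetical ratios of (price in quarter before recession) / (price at recession bottom),
# for illustration only
university_ratios = [1.02, 1.05, 1.01, 1.08]
non_university_ratios = [1.10, 1.15, 1.09, 1.12]
t_stat, p_value = ttest_ind(university_ratios, non_university_ratios)
reject_null = p_value < 0.01  # significance level alpha = .01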
We will use three data sources: Zillow Research data, which provides median home sale prices for all homes at the city level across the United States; a Wikipedia list of university towns in the US; and quarterly US GDP over time from the Bureau of Economic Analysis (the analysis below uses the series in chained 2009 dollars).
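As a quick preview, each source is read with the same files and calls used in the functions below.
import pandas as pd
# Zillow median home values by city, with monthly columns such as '2000-01'
housing = pd.read_csv('City_Zhvi_AllHomes.csv')
# Wikipedia list of US university towns, saved as plain text
with open('university_towns.txt', 'r') as f:
    towns_raw = f.readlines()
# BEA GDP tables; the quarterly series begins a few header rows into the sheet
gdp_raw = pd.read_excel('gdplev.xls', header=4)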
Our t-test resulted in p < .01, indicating that university towns were more resilient during the 2008 recession. Further research should be done to establish whether this is a general pattern or whether the 2008 recession was unique in some way. However, these preliminary findings encourage us to look into the data further.
import pandas as pd
import numpy as np
import re
from scipy.stats import ttest_ind
# dictionary mapping two-letter state abbreviations to full state names
states = {'OH': 'Ohio', 'KY': 'Kentucky', 'AS': 'American Samoa', 'NV': 'Nevada',
'WY': 'Wyoming', 'NA': 'National', 'AL': 'Alabama', 'MD': 'Maryland',
'AK': 'Alaska', 'UT': 'Utah', 'OR': 'Oregon', 'MT': 'Montana', 'IL': 'Illinois',
'TN': 'Tennessee', 'DC': 'District of Columbia', 'VT': 'Vermont', 'ID': 'Idaho',
'AR': 'Arkansas', 'ME': 'Maine', 'WA': 'Washington', 'HI': 'Hawaii', 'WI': 'Wisconsin',
'MI': 'Michigan', 'IN': 'Indiana', 'NJ': 'New Jersey', 'AZ': 'Arizona', 'GU': 'Guam',
'MS': 'Mississippi', 'PR': 'Puerto Rico', 'NC': 'North Carolina', 'TX': 'Texas',
'SD': 'South Dakota', 'MP': 'Northern Mariana Islands', 'IA': 'Iowa', 'MO': 'Missouri',
'CT': 'Connecticut', 'WV': 'West Virginia', 'SC': 'South Carolina', 'LA': 'Louisiana',
'KS': 'Kansas', 'NY': 'New York', 'NE': 'Nebraska', 'OK': 'Oklahoma', 'FL': 'Florida',
'CA': 'California', 'CO': 'Colorado', 'PA': 'Pennsylvania', 'DE': 'Delaware',
'NM': 'New Mexico', 'RI': 'Rhode Island', 'MN': 'Minnesota', 'VI': 'Virgin Islands',
'NH': 'New Hampshire', 'MA': 'Massachusetts', 'GA': 'Georgia', 'ND': 'North Dakota',
'VA': 'Virginia'}
def get_list_of_university_towns():
    '''Returns a DataFrame of towns and the states they are in, parsed from the Wikipedia data'''
unitowns = pd.DataFrame(columns = ["State", "RegionName"])
with open('university_towns.txt', 'r') as f:
for line in f:
if '[' in line:
line = line.split('[')[0]
if '(' in line:
line = line.split('(')[0]
if line in states.values():
state = line
else:
unitowns.loc[len(unitowns)] = [state, line.strip()]
return unitowns
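A quick sanity check on the parser is to confirm the output columns and size; the actual rows depend on the Wikipedia snapshot saved in university_towns.txt.
towns = get_list_of_university_towns()
print(towns.shape)          # (number of towns parsed, 2)
print(list(towns.columns))  # ['State', 'RegionName']
print(towns.head())         # first few State / RegionName pairs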
def get_recession_start():
    '''Returns the year and quarter of the recession start time as a
    string value in a format such as 2005q3'''
gdp = pd.read_excel('gdplev.xls',header = 4)
gdp.rename(columns = {'Unnamed: 4': 'Yearq'}, inplace = True)
    gdp = gdp[gdp['Yearq'] >= '2000q1']  # quarter labels in the sheet use a lowercase q
gdp = gdp[['Yearq', 'GDP in billions of chained 2009 dollars.1' ]]
gdp['Shift'] = gdp['GDP in billions of chained 2009 dollars.1'].shift(1)
gdp['Shift2'] = gdp['GDP in billions of chained 2009 dollars.1'].shift(2)
gdp['ConsecDrop'] =(gdp['GDP in billions of chained 2009 dollars.1']
<gdp['Shift']) & (gdp['Shift'] <gdp['Shift2']) #compare consecutive quarters
recession_start_idx = gdp[gdp['ConsecDrop']==True].index[0]
    recession_start = gdp.loc[recession_start_idx - 1, 'Yearq']  # first of the two declining quarters
return recession_start
def get_recession_end():
    '''Returns the year and quarter of the recession end time as a
    string value in a format such as 2005q3'''
gdp = pd.read_excel('gdplev.xls',header = 4)
gdp.rename(columns = {'Unnamed: 4': 'Yearq'}, inplace = True)
recession_start = get_recession_start()
gdp = gdp[gdp['Yearq']>= recession_start]
gdp = gdp[['Yearq', 'GDP in billions of chained 2009 dollars.1' ]]
gdp['Shift'] = gdp['GDP in billions of chained 2009 dollars.1'].shift(1)
gdp['Shift2'] = gdp['GDP in billions of chained 2009 dollars.1'].shift(2)
    gdp['ConsecGrowth'] = (gdp['GDP in billions of chained 2009 dollars.1'] >
        gdp['Shift']) & (gdp['Shift'] > gdp['Shift2'])  # two consecutive quarters of growth mark the end
    recession_end_idx = gdp[gdp['ConsecGrowth']==True].index[0]
    recession_end = gdp.loc[recession_end_idx, 'Yearq']
return recession_end
def get_recession_bottom():
'''Returns the year and quarter of the recession bottom time as a
string value in a format such as 2005q3'''
gdp = pd.read_excel('gdplev.xls',header = 4)
gdp.rename(columns = {'Unnamed: 4': 'Yearq'}, inplace = True)
recession_start = get_recession_start()
recession_end = get_recession_end()
gdp = gdp[gdp['Yearq']>= recession_start]
gdp = gdp[gdp['Yearq']<= recession_end]
    recession_period_idx = gdp['GDP in billions of chained 2009 dollars.1'].idxmin()
    recession_bottom = gdp.loc[recession_period_idx, 'Yearq']
return recession_bottom
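With these three helpers in place, a quick check of the recession timeline they identify; with the gdplev.xls series used here the values should come out around the 2008 downturn (roughly 2008q3, 2009q2, and 2009q4), though the exact strings depend on the spreadsheet revision.
print(get_recession_start())   # expected to be '2008q3' with this GDP series
print(get_recession_bottom())  # expected to be '2009q2'
print(get_recession_end())     # expected to be '2009q4'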
def convert_housing_data_to_quarters():
'''Converts the housing data to quarters and returns it as average
values in a dataframe.
columns: 2000q1 through 2016q3
multi-index: ["State","RegionName"].
'''
df_housing = pd.read_csv('City_Zhvi_AllHomes.csv')
quarters = ["20"+ "%02d" % (i,) + 'q'+str(j) for i in range(17) for j in range(1, 5) ]
quarters_dict = dict(zip([i for i in range(len(quarters))], quarters))
df_housing['State'] = df_housing['State'].map(states)
df_housing.set_index(['State', 'RegionName'], inplace = True)
idx = df_housing.columns.get_loc('2000-01')
    df_housing = df_housing.iloc[:, idx:]  # keep only the monthly price columns from 2000-01 on
df_h = df_housing.groupby(np.arange(len(df_housing.columns))//3, axis = 1).mean()
df_h.rename(columns = quarters_dict, inplace = True)
return df_h
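A short sanity check of the converted frame; there is one row per city in the Zillow file, and the docstring above fixes the expected column range.
quarterly = convert_housing_data_to_quarters()
print(quarterly.shape)                              # (one row per city, 67 quarterly columns)
print(quarterly.columns[0], quarterly.columns[-1])  # '2000q1' '2016q3'
print(quarterly.index.names)                        # ['State', 'RegionName']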
def run_ttest():
    '''Returns the tuple (different, p, better) where different=True if the
    null hypothesis is rejected at p<0.01 and False otherwise, p is the t-test
    p-value, and better names the group with the lower mean price ratio.'''
df_housing = convert_housing_data_to_quarters().copy()
recession_start = df_housing.columns.get_loc(get_recession_start())
recession_end = df_housing.columns.get_loc(get_recession_end())
quarter_before_recession = df_housing.columns[recession_start-1]
    df_housing = df_housing.iloc[:, recession_start-1: recession_end+1].copy()
df_housing['price ratio'] = df_housing[quarter_before_recession]/df_housing[get_recession_bottom()]
unitowns = get_list_of_university_towns().copy()
unitowns.set_index(["State", 'RegionName'], inplace = True)
df_housing['isCollege'] = df_housing.index.isin(unitowns.index.tolist())
    uni_housing = df_housing[df_housing['isCollege']== True]['price ratio'].dropna()
    other_housing = df_housing[df_housing['isCollege']== False]['price ratio'].dropna()
p = ttest_ind(uni_housing.tolist(),other_housing.tolist())[1]
    different = p < 0.01
    # the group with the lower mean price ratio lost less value between the
    # quarter before the recession and the recession bottom
    if uni_housing.mean() < other_housing.mean():
        better = 'university town'
    else:
        better = 'non university town'
return (different, p, better)
print(run_ttest())