Homework 2 (HW2)


By the end of this homework we expect you to be able to:

  1. Preprocess data and make it amenable to statistical analysis and machine learning models;
  2. Train and test out-of-the-box machine learning models in Python;
  3. Carry out statistical hypothesis testing;
  4. Carry out simple multivariate regression analyses;
  5. Use techniques to control for covariates;

Important Dates


Some rules

  1. You are allowed to use any built-in Python library that comes with Anaconda. If you want to use an external library, you may do so, but must justify your choice.
  2. Make sure you use the data folder provided in the repository in read-only mode. (Or alternatively, be sure you don’t change any of the files.)
  3. Be sure to provide a textual description of your thought process, the assumptions you made, the solution you implemented, and explanations for your answers. A notebook that only has code cells will not suffice.
  4. For questions containing the *Discuss:* prefix, answer not with code, but with a textual explanation (in markdown).
  5. Back up any hypotheses and claims with data, since this is an important aspect of the course.
  6. Please write all your comments in English, and use meaningful variable names in your code. Your repo should have a single notebook (plus the required data files) in the master/main branch. If there are multiple notebooks present, we will not grade anything.
  7. We will not run your notebook for you! Rather, we will grade it as is, which means that only the results contained in your evaluated code cells will be considered, and we will not see the results in unevaluated code cells. Thus, be sure to hand in a fully-run and evaluated notebook. In order to check whether everything looks as intended, you can check the rendered notebook on the GitHub website once you have pushed your solution there.
  8. Following up on the previous point, interactive plots, such as those generated using plotly, should be strictly avoided!
  9. Make sure to print results or dataframes that confirm you have properly addressed the task.

Context

Congratulations! You have just been hired as a data scientist at Piccardi Music, a promising new music label created by a mysterious Italian disc jockey "Signor Piccardi". The company hired you to carry out a variety of data-related tasks, which will be explained in further detail below.


The data

For this homework you will use a dataset of 18,403 music reviews scraped from Pitchfork¹, including relevant metadata such as review author, review date, record release year, review score, and genre, along with the respective album's audio features pulled from Spotify's API. The data consists of the following columns:

| Column | Description |
| --- | --- |
| `artist` | The name of the artist who created the album being reviewed. |
| `album` | The name of the album being reviewed. |
| `recordlabel` | The name of the record label(s) that published the album. |
| `releaseyear` | The year that the album was released. |
| `score` | The score given to the album by the reviewer on a scale of 0.0 to 10.0. |
| `reviewauthor` | The name of the author who reviewed the album. |
| `genre` | The genre assigned to the album by Pitchfork. |
| `reviewdate` | The date that the review was published. |
| `key` | The estimated overall musical key of the track. Integers map to pitches using standard Pitch Class notation (e.g., 0 = C, 2 = D, and so on). |
| `acousticness` | A confidence measure from 0.0 to 1.0 of whether an album is acoustic. 1.0 represents high confidence that the album is acoustic. |
| `danceability` | How suitable an album is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 1.0 is most danceable. |
| `energy` | A perceptual measure of intensity and activity, from 0.0 to 1.0, where 1.0 represents high energy. Metal is often high energy. |
| `instrumentalness` | Predicts whether an album contains no vocals, from 0.0 to 1.0. The closer to 1.0, the more likely the album contains no vocals. |
| `liveness` | Detects the presence of an audience, from 0.0 to 1.0. Scores greater than 0.8 indicate a strong likelihood that the album is live. |
| `loudness` | The overall loudness of the album in decibels (dB). |
| `speechiness` | Measures the presence of spoken words in an album on a scale from 0.0 to 1.0. Scores higher than 0.66 indicate an album made entirely of spoken words, while scores below 0.33 indicate music and other non-speech-like elements. |
| `valence` | A measure from 0.0 to 1.0 describing the musical positiveness conveyed by an album, where values closer to 1.0 indicate more positive sounds. |
| `tempo` | The overall estimated tempo of an album in beats per minute (BPM). |

¹Pinter, Anthony T., et al. "P4KxSpotify: A Dataset of Pitchfork Music Reviews and Spotify Musical Features." Proceedings of the International AAAI Conference on Web and Social Media. Vol. 14. 2020.

Part 1: Will this album be a hit?

The first project you embark on in your new job is to build a regressor to predict whether an album will be well received or not. According to Signor Piccardi (your boss), this algorithm may eventually be helpful in forecasting the success of albums produced by Piccardi Music.

Task 1 (Initial analyses — 10 pts)

As a good data scientist, the first thing you do is to have a good look at the data that was handed to you.

  1. Load the data using pandas. Identify and remove duplicate reviews, i.e., two reviews of an album with the same name by the same band (keep the first occurrence). Print the number of rows in your dataframe.
  1. Plot the distribution of album release years and the average score of albums per year.
  1. For numerical columns, calculate the mean, median, minimum value and maximum value. Additionally, plot the distribution for all the numerical columns in a single image (possibly with multiple subplots). Your image should be at most 14 inches wide by 14 inches long.
We observe a range problem for some columns: `danceability`, `energy`, `speechiness`, `acousticness`, `instrumentalness`, `liveness`, and `valence` should range from **0.0 to 1.0**, yet some values are smaller than 0.0, and `tempo` should be **greater than 0**. We drop the reviews with these irregularities and recompute the mean, median, min, and max. As for the `key` feature, we assume it was estimated by a program, which explains why most of its values are not integers; we decided to keep it as is.
  1. For categorical columns, list how many different values there are in each column. If there are fewer than 10 distinct values for a category, print them all. For the genre column, assign the value 'Other' for albums where the value is either 'none' or NaN.
  1. Discuss: This dataset was built with found data—i.e., the Pitchfork reviews were not made with the goal of training a machine learning model. Grounded in the previous analyses and in Pitchfork's Wikipedia page, point out three (exactly!) ways in which this data may not be representative of music albums in general due to the way the data was collected.
1. The distribution of genres is not representative of the market. What is `Global` music? Is it *classical* music? If so, classical seems underestimated; more likely, `Global` means world music, in which case classical music is simply absent from the dataset, which is itself an indicator. The most represented genre in the database is `Rock`. On the Wikipedia page we can see that Pitchfork was originally dedicated to *indie rock*, i.e., a type of rock of the 1970s played in the USA and the UK, so we can infer that a large part of the `Rock` music reviewed is not representative of all the rock music composed.
2. Pitchfork's music reviews use two different rating systems. Also, the drop in reviews around 2000 seems fishy: did they change their rating system?
3. Reviewers probably focus either on albums by well-known bands or on good albums: there is no point in reviewing an album that nobody would listen to anyway. That would explain the high average of the scores. (That theory is disproven by the following graph: artists with more albums tend to have higher scores.)

The cleaning steps from 1.1, 1.3, and 1.4 are sketched below.
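A minimal sketch of those cleaning steps, assuming the CSV lives at `data/pitchfork.csv` (a hypothetical path) and the columns are named as in the table above:

```python
import pandas as pd

# Hypothetical path; adapt to the data folder provided in the repository.
df = pd.read_csv("data/pitchfork.csv")

# 1.1 Drop duplicate reviews (same artist and album name), keeping the first.
df = df.drop_duplicates(subset=["artist", "album"], keep="first")
print(len(df))

# 1.3 Keep only rows whose audio features fall inside their documented ranges
# (this also drops rows where these features are missing).
bounded = ["danceability", "energy", "speechiness", "acousticness",
           "instrumentalness", "liveness", "valence"]
in_range = (df[bounded].ge(0.0) & df[bounded].le(1.0)).all(axis=1)
df = df[in_range & (df["tempo"] > 0)]
print(df[bounded + ["tempo"]].agg(["mean", "median", "min", "max"]))

# 1.4 Map missing or 'none' genres to 'Other'.
df["genre"] = df["genre"].fillna("Other").replace("none", "Other")
```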

Task 2 (Pre-processing and pipeline code — 12 pts)

Next, you decide to prepare the code that will help you in training your machine learning models. Also, you implement a simple baseline. For this task, unless otherwise stated you must implement functions yourself, instead of relying on scikit-learn (you can use numpy or pandas, though!).

  1. For each possible value in the genre column, create a new column called {genre}_onehot (e.g., for genre=jazz, create jazz_onehot). Collectively, these new columns should "one-hot encode" the genre column—for instance, if for a given album the genre is filled with the value jazz, the jazz_onehot column should equal 1 and all other {genre}_onehot columns should equal 0.
  1. Create a function numpy_helper(df, cols) to obtain a numpy.array out of your dataframe. The function should receive a dataframe df with N rows and a list of M columns cols, and should return a np.array of dimension (NxM).
  1. For each album, build an array of features X containing all genre-related one-hot features, and an array of outcomes y containing scores. Using the function sklearn.model_selection.train_test_split with random_state=123, split the data into a train set containing 70% of all data, and a test set containing the remaining 30%.
  1. Create your own baseline regressor. Using the training data (in the training stage), your regressor should estimate the average score for all albums. Then, for the test data (in the prediction stage), your regressor should always output that average score (computed on the training data).
  1. Calculate the coefficient of determination ($R^2$) of your baseline on the test data. You are allowed to use the sklearn implementation here.
As expected, the $R^2$ of the baseline is very close to 0. (The full pipeline for this task is sketched after this list.)
  1. Discuss: Your train-test split randomly selected 70% of all data for the training set. Why is this a problem for the broader task of predicting whether a future album will be successful or not?
A random split ignores time: the model may be trained on albums released after some of the albums it is tested on, i.e., it leaks information from the future. For the broader task of predicting whether a *future* album will be successful, we should train on the past and test on the future. A single random split also doesn't necessarily reflect the underlying distribution of the data; to ensure the model is evaluated on every example, we could use $k$-fold cross-validation instead of a single train-test split.
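A minimal sketch of steps 1 to 5, assuming `df` is the cleaned dataframe from Task 1 (`numpy_helper` is the required name; `MeanRegressor` is our own):

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 2.1 One-hot encode the genre column by hand.
genres = df["genre"].unique()
for genre in genres:
    df[f"{genre}_onehot"] = (df["genre"] == genre).astype(int)
onehot_cols = [f"{g}_onehot" for g in genres]

# 2.2 Dataframe -> (N x M) numpy array.
def numpy_helper(df, cols):
    return df[cols].to_numpy()

# 2.3 70/30 split with the required seed.
X = numpy_helper(df, onehot_cols)
y = numpy_helper(df, ["score"]).ravel()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

# 2.4 Baseline: always predict the training-set mean.
class MeanRegressor:
    def fit(self, X, y):
        self.mean_ = y.mean()
        return self

    def predict(self, X):
        return np.full(len(X), self.mean_)

# 2.5 R^2 of the baseline on the test set (close to 0 by construction).
baseline = MeanRegressor().fit(X_train, y_train)
print(r2_score(y_test, baseline.predict(X_test)))
```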

Task 3 (Regression — 14 pts)

Finally, you get down to business and train your regression models.

  1. Build a Linear Regression model (use sklearn) that predicts the outcome score using the features "releaseyear", "key", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "valence", "tempo" and the one-hot encoded genre-related columns. Using a 70/30 train-test split similar to what you did in task two (hereinafter referred to as "the random split"; use the same random seed, random_state=123), report the $R^2$ for the testing set.
  1. Create an alternate train-test split (hereinafter referred to as "the longitudinal split") where you train with data from albums released before 2000 and test with data from 2003 and after. Report the $R^2$ for the testing set using the same model you developed for the previous question. Discuss: give the correct interpretation of $R^2$ value for the longitudinal split.
The $R^2$ is actually negative for a linear regression with the longitudinal split. This means that the prediction is worse than simply predicting the mean of the test set. This is not surprising, as the distribution of scores changes drastically after 1999 (cf. Task 1); because of this, the distribution of scores is very different between the train and test sets under the longitudinal split.
  1. For a given entry $X$ your model outputs a predicted score $Y'$. The difference between the real score $Y$ and the predicted score $Y'$ is called the "residual". Considering the model trained in 3.2, plot the distribution of your residuals for the test set. Additionally, estimate the probability that your score prediction (from 3.2) is off by more than 2 points. Provide bootstrapped confidence intervals for your answer.
This distribution is not Gaussian, as the linear model assumes. Indeed, it is left-skewed, which is expected since the test set is shifted left compared to the train set. (The split and the bootstrap estimate are sketched after this list.)
  1. Experiment with training a different regressor, a Gradient Boosting Regressor. This regressor is related to the Boosted decision trees that you have seen in class. This model performs extremely well for a variety of tasks and is often used in machine learning competitions for tabular data (e.g., on Kaggle). Train the regressor using the longitudinal split and the same features as in 3.2, use the default hyperparameters. Report the $R^2$ for the testing set.
  1. Discuss: Hypothesize a reason for the difference in performance between the Linear regression and the Gradient Boosting Regressor.
The train/test split is poorly suited to scoring a model for this task, given the difference in distribution between the two sets. It is likely that the Gradient Boosting Regressor fits the training data better than the Linear Regression, but that this better fit translates into a worse score on the test set. One could call that "overfitting", but the problem really lies a priori with the data split, not the model.
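A minimal sketch of the longitudinal split (3.2) and the bootstrap estimate (3.3), assuming `df` and `onehot_cols` carry over from the previous tasks:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

features = ["releaseyear", "key", "acousticness", "danceability", "energy",
            "instrumentalness", "liveness", "loudness", "speechiness",
            "valence", "tempo"] + onehot_cols

# Longitudinal split: train on the past, test on the future.
train = df[df["releaseyear"] < 2000]
test = df[df["releaseyear"] >= 2003]

model = LinearRegression().fit(train[features], train["score"])
print("test R^2:", model.score(test[features], test["score"]))

# P(|residual| > 2) on the test set, with a bootstrapped 95% CI.
residuals = test["score"].to_numpy() - model.predict(test[features])
rng = np.random.default_rng(123)
boot = [np.mean(np.abs(rng.choice(residuals, size=len(residuals))) > 2)
        for _ in range(10_000)]
print("P(|error| > 2):", np.mean(np.abs(residuals) > 2),
      "95% CI:", np.percentile(boot, [2.5, 97.5]))
```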

Task 4 (Are we solving the correct problem? — 16 pts)

All your efforts so far have assumed that decisions are taken at the "album" level, which is often not the case for bands with multiple albums. In those cases, it could be interesting to predict the success of a given band's album from the features of that album and of the band's previous albums.

  1. Create a new dataframe that contains one row per band with more than 1 album. This dataframe should have the same columns as the data provided to you, considering the latest album of the respective band (note that this is determined by the release year of the album, not the date when it was reviewed). Additionally, for each feature considered in Task 3.1 (including the one-hot encoded features), create an additional column post-fixed with _previous (e.g., danceability_previous). These columns should contain the average values for all of the band's previous albums. Also, create a column score_previous with the average score of previous albums. Print the number of rows in the dataframe as well as the names of the columns.
  1. Train a Gradient Boosting Regressor considering all features created in Task 4.1 (note that score is the outcome and everything else is a feature, including score_previous). Use the 70/30 random train-test split, the default hyperparameters, and report the $R^2$ for the testing set.
This model and these features improve on the previous linear regression with the random split.
  1. Can hyperparameter tuning improve your model? Write modular code (i.e., a function) to divide your training data into $N$ folds and perform cross-validation. Experiment tuning two hyperparameters of the Gradient Boosting Regressor: n_estimators and learning_rate. For each possible combination of the two hyperparameters (see below for the range of values that you should try for each hyperparameter), train your model in a cross-validation setup with $N=20$ folds. Report the mean $R^2$ along with the 90% CI for each scenario.

    • n_estimators $ \in \{ 100, 200, 300, 400\}$
    • learning_rate $ \in \{ 0.1, 0.05, 0.01\}$.

    With the best hyperparameters obtained, train your model with the entire training set and report the $R^2$ on the testing set.

  1. Discuss: How do these results compare with the previous setup (the scenario considered in Task 3.4)? Point out two reasons why it is hard to compare the results obtained in 4.3 and 3.4 at face value. How would you fairly compare the two different setups?
These results are obviously far better than those in Task 3.4, given that we now have a positive $R^2$. However, the two setups are not easy to compare, for several reasons:

- We don't use the same train/test split in the two setups, and the longitudinal split used in Task 3.4 probably makes it harder to score well, since the distribution of $y$ differs between the train and test sets.
- We use neither the same features nor the same model in the two settings. For that reason, we can't easily attribute any improvement to the change in model or in features.

To fairly compare the two setups, we would use the same train/test split, the random 70/30 split. Then, we would evaluate the linear and Gradient Boosting regressors once on each set of features. This way, we can compare different models using the same features, and different feature sets using the same models. A sketch of the cross-validation grid from 4.3 appears below.
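A minimal sketch of the hand-rolled cross-validation in 4.3, assuming `X_train` and `y_train` are numpy arrays built from the Task 4.1 features with the 70/30 random split; the 90% interval is taken over the fold scores:

```python
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

def kfold_scores(X, y, n_folds=20, **params):
    """Train on n_folds-1 folds, score on the held-out fold; return all R^2."""
    rng = np.random.default_rng(123)
    folds = np.array_split(rng.permutation(len(y)), n_folds)
    scores = []
    for k in range(n_folds):
        train_idx = np.concatenate([folds[j] for j in range(n_folds) if j != k])
        model = GradientBoostingRegressor(random_state=123, **params)
        model.fit(X[train_idx], y[train_idx])
        scores.append(r2_score(y[folds[k]], model.predict(X[folds[k]])))
    return np.array(scores)

for n_estimators in [100, 200, 300, 400]:
    for learning_rate in [0.1, 0.05, 0.01]:
        scores = kfold_scores(X_train, y_train,
                              n_estimators=n_estimators,
                              learning_rate=learning_rate)
        lo, hi = np.percentile(scores, [5, 95])
        print(f"n_estimators={n_estimators}, learning_rate={learning_rate}: "
              f"mean R^2={scores.mean():.3f}, 90% CI=({lo:.3f}, {hi:.3f})")
```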

Part 2: Second Album Syndrome

Your second project at Piccardi Music is to shed light on one of the business's oldest enigmas: the "second album syndrome." In a nutshell, the "second album syndrome" is a theory that states that the second album of a band always sucks. (Related read)

Assume—for the purpose of this task—that the Pitchfork data contains all albums for all artists it covers (even though this might not be true in reality).

Task 5 (Preliminary analyses — 8 pts)

You begin by carrying out some preliminary data processing and analyses.

  1. Create a new dataframe (from the original data with duplicates dropped, the same as you obtained after finishing Task 1.1) containing only albums from artists with two or more albums and where the release year is not empty.
  1. Create a new column album_number which indicates how many albums the artist has produced before this one (before the second album, the artist has already produced one album).
Since temporality is very important and the `releaseyear` feature gives only the year, what should we do about artists that released their first two albums during the same year? After reading the posts on Zulip and discussing with Manoel during the lab session, we *admit* that if two albums were released in the same year, we can sort them according to `reviewdate`. In the end, we also dropped these albums and redid the computation; spoiler: the results were comparable.
  1. Calculate the mean and the standard error of the mean of the scores of the first and second albums in the dataset. Additionally, plot the two distributions.
  1. Use an appropriate method to determine whether the difference in means of the 1st and 2nd albums is statistically significant.
  1. Discuss: Do these analyses suggest that the "second album syndrome" exists?
According to the t-test on the scores observed for the first and second albums in the cleaned data (sketched below), the difference in score is significant ($p < 0.01$). That would suggest that the "second album syndrome" theory is **true**: the theory claims that the quality of a second album is often worse than that of the first, i.e., it is less appreciated.
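A minimal sketch of 5.3 and 5.4, assuming `df5` (a hypothetical name) is the dataframe from 5.1 with the `album_number` column from 5.2; Welch's t-test is one appropriate choice since it does not assume equal variances:

```python
from scipy import stats

first = df5.loc[df5["album_number"] == 0, "score"]
second = df5.loc[df5["album_number"] == 1, "score"]

# Mean and standard error of the mean for each group.
print("1st albums:", first.mean(), first.sem())
print("2nd albums:", second.mean(), second.sem())

# Welch's t-test for the difference in means.
print(stats.ttest_ind(first, second, equal_var=False))
```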

Task 6 (Regression analysis — 20 pts)

Next, you proceed to examine some hypotheses about the "second album syndrome" using a regression framework. Namely:

  1. Create a new dataframe containing one row per 1st-2nd album pair. The dataframe should contain the columns:
    • score_diff: the difference in scores between the second and the first album (second - first).
    • time_diff: the number of days elapsed between the first and the second album.
    • did_style_change: a dummy variable that indicates whether the style of the music has changed. To obtain it, first, calculate the standardized euclidean distance of music-related numerical features¹ between the second and the first album. Second, assign 1 to the 20% most distant 1st-2nd album pairs and 0 to all others.

¹ Music related numerical features are: "key", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "valence", and "tempo".

In order to create the new dataframe, we:

1. Create a new dataframe `df2` by merging the sub-dataframes (`album_number` == 0 or 1) on `artist` (the parameter `validate="1:1"` checks that the merge keys are unique in both the left and right datasets). To differentiate the features of the second album (i.e., `album_number` == 1), we add the suffix `_1` to the corresponding column names, and we drop the columns that are no longer useful to keep the dataframe light. In `df2`, one row corresponds to the first two albums of one artist.
2. Compute and add the `score_diff` column.
3. Compute and add the `time_diff` column (assuming that one year = 365 days).
4. Define a function that returns the standardized euclidean distance of the music-related numerical features between the first two albums.
5. Apply it.
6. Finally, create the `did_style_change` column, which equals 1 for the 20% largest distances.
  1. Fit a linear regression using statsmodels with this dataframe. Your regression should consider only an intercept, i.e., "score_diff ~ 1".
  1. Discuss: Interpret the $R^2$ in your regression here. Does this analysis confirm what you observed in Task 5? Why?
The model `score_diff ~ 1` assumes that the difference in scores is independent of any predictor; in other words, we are testing **whether the evolution of the scores is random or not**. $R^2$ corresponds to the fraction of explained variance, which indicates how well our model fits the data: the closer $R^2$ is to one, the better the model. In our case $R^2$ is zero, so **the intercept model is terrible**! However, an intercept-only model makes a constant prediction, which by definition has no variance, so it explains none of the variance of the observed data; $R^2 = 0$ is therefore expected for this model. In Task 5, with a t-test, we rejected the null hypothesis (that the mean scores of the first and second albums are equal), i.e., there is a significant difference in score. **This is confirmed here**, as the $p$-value of the intercept is lower than 0.001.
  1. Include the time_diff and did_style_change as covariates in your model. Fit the regression again and report the summary of your model.
  1. Discuss: Interpret the coefficients time_diff and did_style_change. Carefully explain whether they provide evidence towards each of the aforementioned hypotheses. Do they rule out other reasons that may cause the "second album syndrome" effect?
- `time_diff`: The coefficient is negative (but very small in absolute value, because `time_diff` takes large values), which means that the more time passed between the first and the second album, the lower the second album was rated. **This contradicts the "time spent" hypothesis of the "second album syndrome" effect.**
- `did_style_change`: The coefficient is positive, which could imply that the second album is on average more appreciated when the band changed style. But looking at the standard error and the confidence interval, we are not sure of the sign of the coefficient, and $p > 0.05$: we do not observe significant evidence towards the **"style change" hypothesis**, so **we cannot conclude**.

These coefficients only inform us about the effects of the time difference and the style change; they do not rule out other possible causes of the "second album syndrome" effect.
  1. Create a new column called time_diff_standardized. It should be a standardized version of the time_diff column. Repeat the regression done in 6.4 using the time_diff_standardized column instead of the time_diff column.
  1. Discuss: Explain how the interpretation of the coefficients associated with this new column time_diff_standardized differ from its non-standardized version time_diff?
We now have to take the standardization into account: the coefficient of `time_diff_standardized` gives the expected change in `score_diff` for a one-standard-deviation increase in `time_diff`, rather than for one additional day. Since the standardized values are of magnitude 1 (and not 300-1000), the coefficient of this predictor is correspondingly larger.
As a robustness check, we create the dataframe `df_bis` by dropping the artists that released their first two albums in the same year, and we reapply all the operations of the task. The p-values are similar with and without the drop, and the results of the regression analysis are the same; there are no significant changes. Hence, we conclude that the dropping has no effect, and we keep the undropped data for the next task. A sketch of the distance computation and the regressions appears below.
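A minimal sketch of the `did_style_change` construction and the three regressions, assuming `pairs` (a hypothetical name) holds one row per 1st-2nd album pair with the second album's features suffixed `_1`, as described above:

```python
import numpy as np
import statsmodels.formula.api as smf

music = ["key", "acousticness", "danceability", "energy", "instrumentalness",
         "liveness", "loudness", "speechiness", "valence", "tempo"]

# Standardized euclidean distance between the two albums' audio features,
# using each feature's variance over both albums.
first_vals = pairs[music].to_numpy()
second_vals = pairs[[c + "_1" for c in music]].to_numpy()
var = np.concatenate([first_vals, second_vals]).var(axis=0)
dist = np.sqrt((((second_vals - first_vals) ** 2) / var).sum(axis=1))

# 1 for the 20% most distant pairs, 0 otherwise.
pairs["did_style_change"] = (dist >= np.quantile(dist, 0.8)).astype(int)

# 6.2 Intercept-only regression.
print(smf.ols("score_diff ~ 1", data=pairs).fit().summary())

# 6.4 Add the covariates.
print(smf.ols("score_diff ~ time_diff + did_style_change",
              data=pairs).fit().summary())

# 6.6 Same regression with a standardized time_diff.
pairs["time_diff_standardized"] = (
    pairs["time_diff"] - pairs["time_diff"].mean()) / pairs["time_diff"].std()
print(smf.ols("score_diff ~ time_diff_standardized + did_style_change",
              data=pairs).fit().summary())
```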

Task 7 (Sanity checks — 6 pts)

You decide to perform a few last sanity checks for your analysis.

  1. Discuss: If the Second Album Syndrome existed, i.e., something was special about the second album (as in if it was bad for a very particular reason that afflicted album number 2 more than all others), what would you expect to happen to the mean score of the third album?
Supposing the "Second Album Syndrome" exists and the second album is particularly worse than the others, we would expect the third album **to have a better score** than the second.
  1. Using the dataset you created in Task 5, calculate the mean and the standard error of the mean for the 1st, 2nd, 3rd, and 4th albums. Test whether the difference between the average score of the second and the third album is statistically significant.
  1. Discuss: Does this suggest that the Second Album Syndrome exists?
Since the change in mean score between the second and the third album is smaller than the standard error of the mean scores of the first four albums, **this does not suggest that the "Second Album Syndrome" exists**. (The check is sketched below.)
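A minimal sketch of 7.2, reusing the hypothetical `df5` dataframe from Task 5:

```python
from scipy import stats

# Mean and standard error of the mean for the first four albums.
for n in range(4):
    scores = df5.loc[df5["album_number"] == n, "score"]
    print(f"album {n + 1}: mean={scores.mean():.3f}, sem={scores.sem():.3f}")

# Is the 2nd-vs-3rd difference statistically significant?
second = df5.loc[df5["album_number"] == 1, "score"]
third = df5.loc[df5["album_number"] == 2, "score"]
print(stats.ttest_ind(second, third, equal_var=False))
```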

Task 8 (Eureka — 14 pts)

Your boss, Signor Piccardi, proposes that you carry out a simulation to make things clearer. Assuming that:

Carry out the following simulation:

Analyzing the scores obtained in this simulation, provide a coherent explanation for the scores obtained in Task 7.2.


Hint: You can use numpy to sample random variables (e.g., numpy.random.normal).