The Freemium business model¶

High Note is an anonymized real music streaming company similar to Last.fm, Spotify or Pandora which uses a freemium business model. The expectation is to use the High Note customer data, and build a "free to fee" strategy to convert free accounts to premium subscribers.

We start of by importing the below packages for the functionalities as commented below.

library(dplyr)             # Package for Data preprocessing
library(psych)             # Package for summarising data in tables
library(ggplot2)           # Package for Visualisations
library(MatchIt)           # Package for Matching based on propensity score
library(corrplot)          # Package for Correlation plots
library(gridExtra)         # Package for plotting multiple plots
library(ggExtra)           # Package for Visual Enhancement
library(car)               # Package for VIF
options(scipen = 999)      # To ensure results are not in exponential

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:psych':

    logit

The following object is masked from 'package:dplyr':

    recode

We change the working directory, import the csv file and check the dimensions of the dataframe

getwd()
setwd('C:/Users/Nijanth Anand/Downloads/BANA277-Customer Analytics/Assignment - MidTerm')
data=read.csv("HighNote Data Midterm.csv",na.strings=c(""," ","NA"))
print('Dimension of data is')
print(paste('Row Count:',dim(data)[1]))
print(paste('Column Count:',dim(data)[2]))

[1] "Dimension of data is"
[1] "Row Count: 43827"
[1] "Column Count: 16"

We check if there are NA Values

sum(is.na(data))

Since there are no NA values, we work on the descriptive statistics based on the adopter value as stated in the reading

xyz<- data[data$adopter==1,c(2,3,4,8,9,12,11,13,10,7)] 
a<-psych::describe(xyz,skew=FALSE)
xyz<- data[data$adopter==0,c(2,3,4,8,9,12,11,13,10,7)] 
b<-psych::describe(xyz,skew=FALSE)
print("For Adopter =1")
a
print("For Adopter =0")
b
#data[data$adopter==0,-1] %>% psych::describe(,skew=FALSE,mat=TRUE)

[1] "For Adopter =1"

[1] "For Adopter =0"

We can see there is a difference in means between adopter=0 and adopter=1 grouped records, for instance the average songs listened by adopters is 33K while non adopters have listened to about 17K songs which is half of it.

We assign ID attribute to NULL since we will not be using it further for the modelling and descriptive analytics

data$ID<- NULL

Descriptive Statistics¶

Assign a new data frame for descriptive statistics computation since we will be factorising variables

data_desc<- data

We make the adopter variable a String Factor to help visualize how adopters and non-adopters differ from each other.

data_desc$paid <- ifelse(data_desc$adopter==1,'Subscriber','Free User')
table(data_desc$paid)

 Free User Subscriber 
     40300       3527

Correlation Analysis¶

str(data)

'data.frame':	43827 obs. of  15 variables:
 $ age                  : int  22 35 27 21 24 21 20 23 24 34 ...
 $ male                 : int  0 0 1 0 0 1 0 1 0 1 ...
 $ friend_cnt           : int  8 2 2 28 65 12 15 57 4 13 ...
 $ avg_friend_age       : num  22.6 28 23 22.9 22.3 ...
 $ avg_friend_male      : num  0.429 1 1 0.5 0.914 ...
 $ friend_country_cnt   : int  1 2 1 7 9 1 1 14 1 3 ...
 $ subscriber_friend_cnt: int  0 0 0 1 0 0 0 1 0 0 ...
 $ songsListened        : int  9687 0 508 1357 89984 124547 24852 99877 6125 15997 ...
 $ lovedTracks          : int  194 0 0 32 20 10 391 125 42 82 ...
 $ posts                : int  0 0 0 0 2 0 6 89 0 0 ...
 $ playlists            : int  1 0 1 0 0 1 1 1 0 1 ...
 $ shouts               : int  8 0 2 1 81 2 67 44 5 3 ...
 $ adopter              : int  0 0 0 0 0 0 0 0 0 0 ...
 $ tenure               : int  59 35 42 25 67 53 56 71 34 49 ...
 $ good_country         : int  1 0 0 0 0 1 1 0 1 1 ...

Correlation Analysis of Demographics¶

cor_data<-data[,c('age','male','good_country','friend_cnt','adopter')]
M=cor(cor_data,method = c("pearson", "kendall", "spearman"))
corrplot(M,method="number",addrect=2)  ##circle

Correlation Analysis of Peer Influence¶

cor_data<-data[,c('friend_cnt','avg_friend_age','avg_friend_male','friend_country_cnt','subscriber_friend_cnt','adopter')]
M=cor(cor_data,method = c("pearson", "kendall", "spearman"))
corrplot(M,method="number",addrect=2)  ##circle

Correlation Analysis of User Engagement¶

cor_data<-data[,c('songsListened','lovedTracks','posts','playlists','shouts','tenure','adopter')]
M=cor(cor_data,method = c("pearson", "kendall", "spearman"))
corrplot(M,method="number",addrect=2)  ##circle

Descriptive Graphs for Demographics¶

Plot between friends count and age which is studied for users based on country and adopter status.

labs <- paste("Country Type:", c("Good", "Bad"))
labs2 <- paste(c("Paid User", "Free User"))
data %>% mutate(good_country = ifelse(good_country == 1, labs[1], labs[2]))%>%
  mutate(adopter = ifelse(adopter == 1, labs2[1], labs2[2]))%>%
  ggplot(aes(x=age, y=friend_cnt,shape=adopter, color=adopter)) + geom_point(size=2)+
  facet_wrap(~good_country)+ylim(0,2500)

Warning message:
"Removed 3 rows containing missing values (geom_point)."

We can conclude that the number of friends is relatively higher in bad countries and paid users are normally distributed along age specifically between th20 and 40.

Box Plots for Demographics to study the distribution

data_desc$paid=as.factor(data_desc$paid)
data_desc$male=as.factor(data_desc$male)
data_desc$good_country=as.factor(data_desc$good_country)
print("Box Plot for Demographic understanding to help visualize how adopters and non-adopters perform")
require(gridExtra)
abc1 <- ggplot(data_desc, aes(x=paid, y=age)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc1 <- abc1 + scale_fill_manual(values=c("#FF4500", "#32CD32")) +theme(legend.position="none")+theme(axis.title.x=element_blank())
abc2<- ggplot(data_desc, aes(x=age)) + geom_bar()
abc3<- ggplot(data_desc, aes(x=age)) + geom_density(kernel='gaussian')

data1<-data_desc[data_desc$male==0,]
abc4 <- ggplot(data1, aes(x=paid)) +labs(title = "Female Users")+ geom_bar(aes(fill=paid) )+theme(axis.title.x=element_blank())+ scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")
data2<-data_desc[data_desc$male==1,]
abc5 <- ggplot(data2, aes(x=paid)) +labs(title = "Male Users")+ geom_bar(aes(fill=paid) )+theme(axis.title.x=element_blank())+ scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none") #geom_bar(stat="identity") #geom_histogram()
abc6 <- ggplot(data_desc, aes(paid))
abc6 <- abc6  + geom_bar(aes(fill = male))+ scale_fill_manual(values=c("#808080", "#FFD700"))+theme(axis.title.x=element_blank())


data1<-data_desc[data_desc$good_country==0,]
abc7 <- ggplot(data1, aes(x=paid)) +labs(title = "Bad Country Users")+ geom_bar(aes(fill=paid) )+theme(axis.title.x=element_blank())+ scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")
data2<-data_desc[data_desc$good_country==1,]
abc8 <- ggplot(data2, aes(x=paid)) +labs(title = "Good Country Users")+ geom_bar(aes(fill=paid) ) +theme(axis.title.x=element_blank())+ scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")#geom_bar(stat="identity") #geom_histogram()
data_descx=mutate(data_desc,country=good_country)
abc9 <- ggplot(data_descx, aes(paid))
abc9 <- abc9  + geom_bar(aes(fill = country))+ scale_fill_manual(values=c("#808080", "#FFD700"))+theme(axis.title.x=element_blank())

grid.arrange(abc1,abc2,abc3,abc4,abc5,abc6,abc7,abc8,abc9, ncol=3)

[1] "Box Plot for Demographic understanding to help visualize how adopters and non-adopters perform"

Descriptive Graphs for Peer Influence¶

Plot between friend count and subscriber count to understand if having more friends leads to more subscriber friends based on gender and adopter.

labs <- paste("Gender:", c("Male", "Female"))
labs2 <- paste(c("Paid User", "Free User"))
data %>% mutate(male = ifelse(male == 1, labs[1], labs[2]))%>%
  mutate(adopter = ifelse(adopter == 1, labs2[1], labs2[2]))%>%
  ggplot(aes(x=friend_cnt, y=subscriber_friend_cnt,shape=male, color=male)) + geom_point(size=2)+
  facet_wrap(~adopter)+ylim(0,100)

Warning message:
"Removed 4 rows containing missing values (geom_point)."

We observe that the Paid users have lesser friends, compared to free users and gender doesn't have much difference in it.

We plot the Box plots to understand the spread of the variables across adopters and non-adopters

print("Box Plot for Peer Influence understanding to help visualize how adopters and non-adopters perform")
abc2 <- ggplot(data_desc, aes(x=paid, y=friend_cnt)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc2 <- abc2 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$friend_cnt, c(0.1, 0.9)))+theme(axis.title.x=element_blank())
abc3 <- ggplot(data_desc, aes(x=paid, y=avg_friend_age)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc3 <- abc3 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$avg_friend_age, c(0.1, 0.9)))+theme(axis.title.x=element_blank())
abc4 <- ggplot(data_desc, aes(x=paid, y=avg_friend_male)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc4 <- abc4 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$avg_friend_male, c(0.1, 0.9)))+theme(axis.title.x=element_blank())
abc5 <- ggplot(data_desc, aes(x=paid, y=friend_country_cnt)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc5 <- abc5 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$friend_country_cnt, c(0.1, 0.9)))+theme(axis.title.x=element_blank())
abc6 <- ggplot(data_desc, aes(x=paid, y=subscriber_friend_cnt)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc6 <- abc6 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$subscriber_friend_cnt, c(0.1, 0.9)))+theme(axis.title.x=element_blank())
grid.arrange(abc2,abc3,abc4,abc5,abc6, ncol=3)

[1] "Box Plot for Peer Influence understanding to help visualize how adopters and non-adopters perform"

Warning message:
"Removed 4365 rows containing non-finite values (stat_boxplot)."Warning message:
"Removed 8211 rows containing non-finite values (stat_boxplot)."Warning message:
"Removed 4268 rows containing non-finite values (stat_boxplot)."Warning message:
"Removed 3995 rows containing non-finite values (stat_boxplot)."

Descriptive Graphs for User Engagement¶

Plot between songs listened and loved to understand based on gender and adopter to check if user engagement changes based on the said covariants

labs <- paste("Gender:", c("Male", "Female"))
labs2 <- paste(c("Paid User", "Free User"))
data %>% mutate(male = ifelse(male == 1, labs[1], labs[2]))%>%
  mutate(adopter = ifelse(adopter == 1, labs2[1], labs2[2]))%>%
  ggplot(aes(x=songsListened, y=lovedTracks,shape=adopter, color=adopter)) + geom_point(size=2)+
  facet_wrap(~male)#+ylim(0,100)

We can observe that in the given sample, male users listen to more number of songs compared to female users and the users who have listened to the maximum number of songs or at the top 95th percentile aren't essentially paid customers.

Plot between songs listened and playlists to understand based on gender and adopter to check if user engagement changes based on the said covariants

labs <- paste("Gender:", c("Male", "Female"))
labs2 <- paste(c("Paid User", "Free User"))
data %>% mutate(male = ifelse(male == 1, labs[1], labs[2]))%>%
  mutate(adopter = ifelse(adopter == 1, labs2[1], labs2[2]))%>%
  ggplot(aes(x=songsListened, y=playlists,shape=adopter, color=adopter)) + geom_point(size=2)+
  facet_wrap(~male) #+ylim(0,150)

The number of playlists doesn't essentially depend on the the count of songs listened to but in general males have higher counts of playlists in comparision to female users. Similar to the previous graph, users who have the highest engagement with maximum number of playlists or being in the top 95th percentile doesn't essentially guarantee they are paid customers.

We plot the Box plots to understand the spread of the variables across adopters and non-adopters

print("Box Plot for User Engagement Influence understanding to help visualize how adopters and non-adopters perform")
abc7 <- ggplot(data_desc, aes(x=paid, y=songsListened)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc7 <- abc7 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$songsListened, c(0.1, 0.9)))+theme(axis.title.x=element_blank())
abc8 <- ggplot(data_desc, aes(x=paid, y=lovedTracks)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc8 <- abc8 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$lovedTracks, c(0.1, 0.9)))+theme(axis.title.x=element_blank())
abc9 <- ggplot(data_desc, aes(x=paid, y=posts)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc9 <- abc9 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$posts, c(0.1, 0.9)))+theme(axis.title.x=element_blank())
abc10 <- ggplot(data_desc, aes(x=paid, y=playlists)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc10 <- abc10 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$playlists, c(0, 0.95)))
abc11 <- ggplot(data_desc, aes(x=paid, y=shouts)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc11 <- abc11 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$shouts, c(0.1, 0.9)))
abc12 <- ggplot(data_desc, aes(x=paid, y=tenure)) + geom_boxplot(aes(fill=paid),outlier.colour="red", outlier.shape=8,outlier.size=1)
abc12 <- abc12 + scale_fill_manual(values=c("#FF4500", "#32CD32"))+theme(legend.position="none")+scale_y_continuous(limits = quantile(data_desc$tenure, c(0.1, 0.9)))
grid.arrange(abc7,abc8,abc9,abc10,abc11,abc12, ncol=3)

[1] "Box Plot for User Engagement Influence understanding to help visualize how adopters and non-adopters perform"

Warning message:
"Removed 8757 rows containing non-finite values (stat_boxplot)."Warning message:
"Removed 4371 rows containing non-finite values (stat_boxplot)."Warning message:
"Removed 3976 rows containing non-finite values (stat_boxplot)."Warning message:
"Removed 744 rows containing non-finite values (stat_boxplot)."Warning message:
"Removed 7926 rows containing non-finite values (stat_boxplot)."Warning message:
"Removed 8533 rows containing non-finite values (stat_boxplot)."

We plot the variation of parameters based on the user engagement.

pairs(data[,c(8:12,14)])

Correlation plot of all predictors

M=cor(data,method = c("pearson", "kendall", "spearman"))
corrplot(M,method="number",addrect=2)

We define treatment as users who have one or more than 1 subscriber friends and study the variation of tratment across users from the two adopter classes

data <- mutate(data,treatment=ifelse(data$subscriber_friend_cnt >=1,1,0))
data %>%
  group_by(adopter) %>% summarise(mean_treatment = mean(treatment),users=n())

There seems to be a difference in the treatment for the two class of adopters since the means are not similar so we run a t-test to find if there is any significance.

with(data, t.test(treatment ~ adopter))

	Welch Two Sample t-test

data:  treatment by adopter
t = -33.978, df = 3931.7, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.3109641 -0.2770354
sample estimates:
mean in group 0 mean in group 1 
      0.2004715       0.4944712

So we can say that the treatment is a significant predictor and try understanding the significance across covariants

data %>%
  group_by(treatment) %>%
  summarise_all(funs(mean(., na.rm = T)))

We can see there is a significant difference in mean of the etimators for the treatment so we run a t-test on the means.

They is significance between them since the means are not the same so we calculate the t-test for each covariate by treatment status

labels <- c('age','male','friend_cnt','avg_friend_age','avg_friend_male','friend_country_cnt','songsListened','lovedTracks','posts','playlists','shouts','tenure','good_country')

lapply(labels, function(v) {
  t.test(data[, v] ~ data$treatment)
 })

[[1]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -20.841, df = 14645, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.778544 -1.472749
sample estimates:
mean in group 0 mean in group 1 
       23.74756        25.37321 


[[2]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -1.3459, df = 15986, p-value = 0.1784
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.018236129  0.003388028
sample estimates:
mean in group 0 mean in group 1 
      0.6288378       0.6362618 


[[3]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -33.707, df = 9903.1, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -46.12459 -41.05469
sample estimates:
mean in group 0 mean in group 1 
       10.43133        54.02097 


[[4]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -27.658, df = 15667, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.744514 -1.513611
sample estimates:
mean in group 0 mean in group 1 
       23.76137        25.39043 


[[5]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -7.7114, df = 23020, p-value = 0.00000000000001294
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.02846397 -0.01692672
sample estimates:
mean in group 0 mean in group 1 
      0.6131124       0.6358077 


[[6]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -65.05, df = 10372, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -6.861271 -6.459857
sample estimates:
mean in group 0 mean in group 1 
       2.725062        9.385626 


[[7]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -41.505, df = 11447, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -20037.04 -18229.80
sample estimates:
mean in group 0 mean in group 1 
       14602.22        33735.64 


[[8]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -31.265, df = 10585, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -170.1918 -150.1102
sample estimates:
mean in group 0 mean in group 1 
       65.21365       225.36465 


[[9]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -7.3649, df = 9933.6, p-value = 0.0000000000001914
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -22.76492 -13.19424
sample estimates:
mean in group 0 mean in group 1 
       2.543377       20.522956 


[[10]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -10.492, df = 11238, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.2546958 -0.1745100
sample estimates:
mean in group 0 mean in group 1 
      0.5294671       0.7440700 


[[11]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -11.426, df = 9888.1, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -100.04703  -70.74591
sample estimates:
mean in group 0 mean in group 1 
       16.42304       101.81951 


[[12]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = -14.696, df = 15805, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -3.792309 -2.899752
sample estimates:
mean in group 0 mean in group 1 
       43.20268        46.54871 


[[13]]

	Welch Two Sample t-test

data:  data[, v] by data$treatment
t = 2.0956, df = 16030, p-value = 0.03613
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.0007383591 0.0220968020
sample estimates:
mean in group 0 mean in group 1 
      0.3546936       0.3432760

The alternative hypothesis: true difference in means is not equal to 0 is the outcome for all the variables in the data in comparision with the adopter variable.

The difference-in-means is statistically significant at conventional levels of confidence as the p value is less than 0.05

Propensity score estimation¶

We estimate the propensity score by running a logit model (probit also works), where the outcome variable is a binary variable indicating treatment status.

It helps understand covariates(Independent Variables) we should include for the matching to give a causal estimate.In the end, you need to include any covariate that is related to both the treatment assignment and potential outcomes

model_pse <- glm(treatment ~  age+male+friend_cnt+avg_friend_age+avg_friend_male+friend_country_cnt+songsListened+lovedTracks+posts+playlists+shouts+tenure+good_country,
            family = binomial(), data = data)
summary(model_pse)

Warning message:
"glm.fit: fitted probabilities numerically 0 or 1 occurred"

Call:
glm(formula = treatment ~ age + male + friend_cnt + avg_friend_age + 
    avg_friend_male + friend_country_cnt + songsListened + lovedTracks + 
    posts + playlists + shouts + tenure + good_country, family = binomial(), 
    data = data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-4.4206  -0.5671  -0.4220  -0.3001   2.5619  

Coefficients:
                        Estimate    Std. Error z value             Pr(>|z|)    
(Intercept)        -5.1439677039  0.0770258529 -66.782 < 0.0000000000000002 ***
age                 0.0196976670  0.0028079947   7.015      0.0000000000023 ***
male                0.0431142354  0.0299809578   1.438             0.150419    
friend_cnt          0.0313218488  0.0010337026  30.301 < 0.0000000000000002 ***
avg_friend_age      0.0795507296  0.0034814999  22.850 < 0.0000000000000002 ***
avg_friend_male     0.2513881424  0.0502850056   4.999      0.0000005754883 ***
friend_country_cnt  0.1110348187  0.0047649448  23.302 < 0.0000000000000002 ***
songsListened       0.0000069064  0.0000005156  13.396 < 0.0000000000000002 ***
lovedTracks         0.0006670647  0.0000564517  11.817 < 0.0000000000000002 ***
posts               0.0005699077  0.0002682316   2.125             0.033613 *  
playlists           0.0056389408  0.0118975589   0.474             0.635530    
shouts             -0.0000490853  0.0000370677  -1.324             0.185434    
tenure             -0.0025711127  0.0007768961  -3.309             0.000935 ***
good_country        0.0320120020  0.0292175239   1.096             0.273235    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 46640  on 43826  degrees of freedom
Residual deviance: 34170  on 43813  degrees of freedom
AIC: 34198

Number of Fisher Scoring iterations: 7

vif(model_pse)

Using this model, we can now calculate the propensity score for each user. It is simply the user’s predicted probability of being Treated, given the estimates from the logit model. Below, I calculate this propensity score using predict() and create a dataframe that has the propensity score as well as the users actual treatment status.

propensity_score <- data.frame(pr_score = predict(model_pse, type = "response"),
                     treatment=model_pse$model$treatment)

We now examine the region of common support, so post estimating the propensity score, we plot histograms of the estimated propensity scores based on treatment status to study the distribution

labs <- paste("Treatment Type:", c("0 Subscriber friends", "1 or more Subscriber friends"))
propensity_score %>%
  mutate(treatment = ifelse(treatment == 0, labs[1], labs[2])) %>%
  ggplot(aes(x = pr_score)) +
  geom_histogram(color = "white",bins=55) +
  facet_wrap(~treatment) +
  xlab("Probability of having subscriber friends") +
  theme_bw()+xlim(0,1.01)

Warning message:
"Removed 4 rows containing missing values (geom_bar)."

We check if there are NA Values before the matching since the MatchIt doesn't allow NA Values.

sum(is.na(data))

Since the count of NA values is zero we proceed with the MatchIt function

mod_match <- matchit(treatment ~ age+male+friend_cnt+avg_friend_age+avg_friend_male+friend_country_cnt+songsListened+lovedTracks+posts+playlists+shouts+tenure+good_country,
                     method = "nearest", data = data)
summary(mod_match)

Warning message:
"glm.fit: fitted probabilities numerically 0 or 1 occurred"

Call:
matchit(formula = treatment ~ age + male + friend_cnt + avg_friend_age + 
    avg_friend_male + friend_country_cnt + songsListened + lovedTracks + 
    posts + playlists + shouts + tenure + good_country, data = data, 
    method = "nearest")

Summary of balance for all data:
                   Means Treated Means Control SD Control  Mean Diff    eQQ Med
distance                  0.4635        0.1550     0.1436     0.3086     0.2506
age                      25.3732       23.7476     6.2245     1.6256     1.0000
male                      0.6363        0.6288     0.4831     0.0074     0.0000
friend_cnt               54.0210       10.4313    15.2769    43.5896    22.0000
avg_friend_age           25.3904       23.7614     5.0577     1.6291     1.5909
avg_friend_male           0.6358        0.6131     0.3343     0.0227     0.0738
friend_country_cnt        9.3856        2.7251     3.1024     6.6606     5.0000
songsListened         33735.6404    14602.2205 23214.2898 19133.4199 15471.0000
lovedTracks             225.3647       65.2137   181.4812   160.1510    65.0000
posts                    20.5230        2.5434    33.7947    17.9796     0.0000
playlists                 0.7441        0.5295     0.9673     0.2146     0.0000
shouts                  101.8195       16.4230    79.7381    85.3965    15.0000
tenure                   46.5487       43.2027    19.7212     3.3460     3.0000
good_country              0.3433        0.3547     0.4784    -0.0114     0.0000
                     eQQ Mean     eQQ Max
distance               0.3086      0.6840
age                    1.6296      5.0000
male                   0.0074      1.0000
friend_cnt            43.5838   4794.0000
avg_friend_age         1.6369     11.5000
avg_friend_male        0.0958      0.3636
friend_country_cnt     6.6598     95.0000
songsListened      19126.1623 653702.0000
lovedTracks          159.9562   6343.0000
posts                 17.8829   9535.0000
playlists              0.2092     26.0000
shouts                85.1764  59168.0000
tenure                 3.3473     10.0000
good_country           0.0114      1.0000


Summary of balance for matched data:
                   Means Treated Means Control SD Control Mean Diff   eQQ Med
distance                  0.4635        0.3040     0.1913    0.1596    0.1077
age                      25.3732       26.3324     7.9056   -0.9592    1.0000
male                      0.6363        0.6576     0.4745   -0.0214    0.0000
friend_cnt               54.0210       21.4666    23.5251   32.5544   12.0000
avg_friend_age           25.3904       26.5572     6.7320   -1.1668    0.4376
avg_friend_male           0.6358        0.6551     0.2643   -0.0193    0.0158
friend_country_cnt        9.3856        5.0914     4.6473    4.2942    2.0000
songsListened         33735.6404    27360.8630 33892.7804 6374.7775 4680.0000
lovedTracks             225.3647      134.5440   299.1995   90.8206   38.0000
posts                    20.5230        6.2773    60.2598   14.2456    0.0000
playlists                 0.7441        0.6723     1.4015    0.0718    0.0000
shouts                  101.8195       37.2362   138.8781   64.5833   10.0000
tenure                   46.5487       47.7039    19.0357   -1.1551    1.0000
good_country              0.3433        0.3581     0.4795   -0.0149    0.0000
                    eQQ Mean     eQQ Max
distance              0.1596      0.4517
age                   0.9592      7.0000
male                  0.0214      1.0000
friend_cnt           32.5544   4794.0000
avg_friend_age        1.2763     14.0000
avg_friend_male       0.0326      0.1602
friend_country_cnt    4.2942     95.0000
songsListened      6374.7775 566867.0000
lovedTracks          90.8206   6180.0000
posts                14.2456   9535.0000
playlists             0.1035     22.0000
shouts               64.5833  59168.0000
tenure                1.2995      4.0000
good_country          0.0149      1.0000

Percent Balance Improvement:
                   Mean Diff. eQQ Med  eQQ Mean  eQQ Max
distance              48.2930 57.0083   48.2908  33.9658
age                   40.9972  0.0000   41.1419 -40.0000
male                -187.9614  0.0000 -187.6712   0.0000
friend_cnt            25.3162 45.4545   25.3062   0.0000
avg_friend_age        28.3760 72.4916   22.0309 -21.7391
avg_friend_male       14.7957 78.6165   65.9532  55.9466
friend_country_cnt    35.5279 60.0000   35.5203   0.0000
songsListened         66.6825 69.7499   66.6699  13.2836
lovedTracks           43.2906 41.5385   43.2216   2.5698
posts                 20.7676  0.0000   20.3394   0.0000
playlists             66.5567  0.0000   50.5109  15.3846
shouts                24.3724 33.3333   24.1770   0.0000
tenure                65.4771 66.6667   61.1782  60.0000
good_country         -30.1771  0.0000  -30.3571   0.0000

Sample sizes:
          Control Treated
All         34004    9823
Matched      9823    9823
Unmatched   24181       0
Discarded       0       0

Since there are 9823 records that have been treated[Subscriber Friend count is >= 1], the Matchit matches it with 9823 records from the Control that have the nearest Propensity score estimate.

So we now have 9823 treated and 9823 control records that have been matched based on Propensity score.

plot(mod_match)

We can see how post matching the QQ plots are normalised for the covariants where All on the left side denoted before Matching and Matched refers to the QQ Plot of the covariant after Propensity score matching.

We now create a dataframe with only the 9823 treatment records and 9823 matched Control records using the match.data function.

dta_m <- match.data(mod_match)
print("Dimensions of the matched data frame")
dim(dta_m)
print("Table of Treatment distriution in the matched data frame")
table(dta_m$treatment)

[1] "Dimensions of the matched data frame"

[1] "Table of Treatment distriution in the matched data frame"

   0    1 
9823 9823

Out of curiousity we want to study the distribution of Propensity score estimated distance across Control and Treatment. So we plot a boxplot for it.

dta_m$treat <- as.factor(dta_m$treatment)
abc2 <- ggplot(dta_m, aes(y=distance,x=treat)) + geom_boxplot(outlier.colour="red", outlier.shape=8,outlier.size=1) 
abc2 <- abc2 +theme(legend.position="none") 
abc2
dta_m$treat <- NULL

We can observe that there are covariates whose values are matched between treatment and control, so we do a visual inspection of it.

fn_bal <- function(dta, variable) {
  dta$variable <- dta[, variable]
  dta$treatment <- as.factor(dta$treatment)
  support <- c(min(dta$variable), max(dta$variable))
  ggplot(dta, aes(x = distance, y = variable, color = treatment)) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score") +
    ylab(variable) +
    theme_bw() +
    ylim(support)
}

grid.arrange(
  fn_bal(dta_m, "age"),
  fn_bal(dta_m, "male") + theme(legend.position = "none"),
  fn_bal(dta_m, "friend_cnt"),
  fn_bal(dta_m, "avg_friend_age") + theme(legend.position = "none"),
  fn_bal(dta_m, "avg_friend_male"),
  fn_bal(dta_m, "songsListened") + theme(legend.position = "none"),
  fn_bal(dta_m, "lovedTracks"),
  fn_bal(dta_m, "posts") + theme(legend.position = "none"),
  fn_bal(dta_m, "playlists"),  
  fn_bal(dta_m, "shouts") + theme(legend.position = "none"),
  fn_bal(dta_m, "tenure"),
  fn_bal(dta_m, "good_country") + theme(legend.position = "none"),
  nrow = 7)

Warning message:
"Removed 4 rows containing missing values (geom_smooth)."

We compare the means across the treatments post propensity score matching

group_by(dta_m,treatment) %>% 
  summarise_all(funs(mean(., na.rm = T)))

The means are in range for selected predictors and we now try finding if the difference of means is significant through a t-test for the covariants

labels <- c('age','male','friend_cnt','avg_friend_age','avg_friend_male','friend_country_cnt','songsListened','lovedTracks','posts','playlists','shouts','tenure','good_country')

lapply(labels, function(v) {
  t.test(dta_m[, v] ~ dta_m$treatment)
 })

[[1]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = 9.0201, df = 19340, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.750746 1.167609
sample estimates:
mean in group 0 mean in group 1 
       26.33238        25.37321 


[[2]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = 3.1356, df = 19640, p-value = 0.001718
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.008014461 0.034742334
sample estimates:
mean in group 0 mean in group 1 
      0.6576402       0.6362618 


[[3]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = -24.809, df = 10486, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -35.12657 -29.98225
sample estimates:
mean in group 0 mean in group 1 
       21.46656        54.02097 


[[4]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = 13.628, df = 18412, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.9989779 1.3346200
sample estimates:
mean in group 0 mean in group 1 
       26.55723        25.39043 


[[5]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = 5.4719, df = 19270, p-value = 0.00000004508
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.01241054 0.02626427
sample estimates:
mean in group 0 mean in group 1 
      0.6551451       0.6358077 


[[6]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = -38.564, df = 13868, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -4.512475 -4.075940
sample estimates:
mean in group 0 mean in group 1 
       5.091418        9.385626 


[[7]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = -11.383, df = 18452, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -7472.436 -5277.119
sample estimates:
mean in group 0 mean in group 1 
       27360.86        33735.64 


[[8]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = -15.488, df = 16091, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -102.31423  -79.32702
sample estimates:
mean in group 0 mean in group 1 
       134.5440        225.3647 


[[9]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = -5.6775, df = 11043, p-value = 0.00000001401
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -19.164006  -9.327289
sample estimates:
mean in group 0 mean in group 1 
       6.277308       20.522956 


[[10]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = -2.9528, df = 17787, p-value = 0.003154
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.1194130 -0.0241277
sample estimates:
mean in group 0 mean in group 1 
      0.6722997       0.7440700 


[[11]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = -8.5069, df = 10514, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -79.46491 -49.70174
sample estimates:
mean in group 0 mean in group 1 
       37.23618       101.81951 


[[12]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = 4.1551, df = 19604, p-value = 0.00003266
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.6102242 1.7000680
sample estimates:
mean in group 0 mean in group 1 
       47.70386        46.54871 


[[13]]

	Welch Two Sample t-test

data:  dta_m[, v] by dta_m$treatment
t = 2.183, df = 19642, p-value = 0.02905
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.001517727 0.028208426
sample estimates:
mean in group 0 mean in group 1 
      0.3581391       0.3432760

We now Estimate the treatment effects since we have the matched sample that we are happy with. We can use a t-test:

with(dta_m, t.test(adopter~ treatment))

	Welch Two Sample t-test

data:  adopter by treatment
t = -18.938, df = 18060, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.10009352 -0.08131745
sample estimates:
mean in group 0 mean in group 1 
     0.08683702      0.17754250

Machine Learning¶

Now, we will use a logistic regression approach to test which variables (including subscriber friends) are significant for explaining the likelihood of becoming an adopter. We use our judgment and visualization results to decide which variables to include in the regression. We also estimate the odds ratios for the key variables.

Model1:¶

We build a glm model with only the treatment variable to check if it is a significant predictor of the adopter variable.

glm_treat1 <- glm(adopter ~ treatment, data = dta_m,family='binomial')
summary(glm_treat1)

Call:
glm(formula = adopter ~ treatment, family = "binomial", data = dta_m)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.6252  -0.6252  -0.4262  -0.4262   2.2108  

Coefficients:
            Estimate Std. Error z value            Pr(>|z|)    
(Intercept) -2.35288    0.03583  -65.67 <0.0000000000000002 ***
treatment    0.81979    0.04451   18.42 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 15345  on 19645  degrees of freedom
Residual deviance: 14986  on 19644  degrees of freedom
AIC: 14990

Number of Fisher Scoring iterations: 5

exp(coef(glm_treat1))

The treatment [Having subscriber friends] is a significant predictor in judging if the user will be a free or a fee user. The odds ratio of the treatment is 2.27 which is greater than 1 and hence supports the case, also the probability is almost zero which makes it a good predictor

Model 2¶

We now build a glm model with the treatment variable and other covariates to predict the adopter status.

glm_treat2 <- glm(adopter ~ treatment+age+male+friend_cnt+avg_friend_age+avg_friend_male+friend_country_cnt+songsListened+lovedTracks+posts+playlists+shouts+tenure+good_country+treatment, data = dta_m,family='binomial')
summary(glm_treat2)

Call:
glm(formula = adopter ~ treatment + age + male + friend_cnt + 
    avg_friend_age + avg_friend_male + friend_country_cnt + songsListened + 
    lovedTracks + posts + playlists + shouts + tenure + good_country + 
    treatment, family = "binomial", data = dta_m)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.2240  -0.5668  -0.4562  -0.3697   2.5257  

Coefficients:
                        Estimate    Std. Error z value             Pr(>|z|)    
(Intercept)        -3.3709684810  0.1261268190 -26.727 < 0.0000000000000002 ***
treatment           0.7292753804  0.0468153008  15.578 < 0.0000000000000002 ***
age                 0.0141945749  0.0040691111   3.488             0.000486 ***
male                0.3039659743  0.0489909251   6.205 0.000000000548584009 ***
friend_cnt         -0.0002001575  0.0002792679  -0.717             0.473545    
avg_friend_age      0.0130425316  0.0053490465   2.438             0.014757 *  
avg_friend_male     0.0600688475  0.0925246717   0.649             0.516196    
friend_country_cnt  0.0072774704  0.0036486080   1.995             0.046088 *  
songsListened       0.0000042553  0.0000005301   8.027 0.000000000000000997 ***
lovedTracks         0.0005210828  0.0000469224  11.105 < 0.0000000000000002 ***
posts               0.0001186059  0.0000889116   1.334             0.182212    
playlists           0.0446485958  0.0119444028   3.738             0.000185 ***
shouts              0.0001119478  0.0000745439   1.502             0.133156    
tenure             -0.0024335424  0.0012170876  -1.999             0.045556 *  
good_country       -0.3694917638  0.0480180878  -7.695 0.000000000000014167 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 15345  on 19645  degrees of freedom
Residual deviance: 14493  on 19631  degrees of freedom
AIC: 14523

Number of Fisher Scoring iterations: 5

Predictor variables such as friend count, average friend being a male, posts and shouts don't have much significance in deciding if the user would be a free or a fee customer. Variables such as age, gender, having subscriber friends, songs listened, loved tracks, country from and playlists have significant impact on the adopter variable.

Model 3 - Enhancement of Model 2 with significant predictors¶

We build the third glm model with only the significant predictors from the glm_model_2

glm_treat3 <- glm(adopter ~ treatment+age+male+avg_friend_age+friend_country_cnt+songsListened+lovedTracks+playlists+tenure+good_country, data = dta_m)
summary(glm_treat3)

Call:
glm(formula = adopter ~ treatment + age + male + avg_friend_age + 
    friend_country_cnt + songsListened + lovedTracks + playlists + 
    tenure + good_country, data = dta_m)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28859  -0.15589  -0.10624  -0.05655   0.99818  

Coefficients:
                         Estimate     Std. Error t value             Pr(>|t|)
(Intercept)        -0.03377255980  0.01265349611  -2.669             0.007613
treatment           0.07693526599  0.00490039844  15.700 < 0.0000000000000002
age                 0.00171597141  0.00045852615   3.742             0.000183
male                0.03029474232  0.00511504251   5.923  0.00000000322022844
avg_friend_age      0.00165086368  0.00056940879   2.899             0.003745
friend_country_cnt  0.00108730614  0.00032514409   3.344             0.000827
songsListened       0.00000063991  0.00000006592   9.708 < 0.0000000000000002
lovedTracks         0.00008598994  0.00000596592  14.414 < 0.0000000000000002
playlists           0.00719924354  0.00140642590   5.119  0.00000031035110903
tenure             -0.00028898123  0.00013195338  -2.190             0.028534
good_country       -0.03911872213  0.00501421899  -7.802  0.00000000000000642
                      
(Intercept)        ** 
treatment          ***
age                ***
male               ***
avg_friend_age     ** 
friend_country_cnt ***
songsListened      ***
lovedTracks        ***
playlists          ***
tenure             *  
good_country       ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.1091944)

    Null deviance: 2253.7  on 19645  degrees of freedom
Residual deviance: 2144.0  on 19635  degrees of freedom
AIC: 12257

Number of Fisher Scoring iterations: 2

We find all the predictors are significant in the model and hence calculate the odds ratio of the estimators below.

exp(coef(glm_treat3))

Model 4 - [Model 3 with original data]¶

We build a glm model with the same significant predictors from glm_model_3 but with the original dataset which has not been matched by propensity score, to try finding if the odds ratio are similar.

glm_treat4 <- glm(adopter ~ treatment+age+male+avg_friend_age+friend_country_cnt+songsListened+lovedTracks+playlists+tenure+good_country, data = data)
summary(glm_treat4)

Call:
glm(formula = adopter ~ treatment + age + male + avg_friend_age + 
    friend_country_cnt + songsListened + lovedTracks + playlists + 
    tenure + good_country, data = data)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.39105  -0.09437  -0.05231  -0.02488   1.00860  

Coefficients:
                         Estimate     Std. Error t value             Pr(>|t|)
(Intercept)        -0.04683542149  0.00642124147  -7.294    0.000000000000306
treatment           0.08132195744  0.00346437697  23.474 < 0.0000000000000002
age                 0.00196724618  0.00027577639   7.133    0.000000000000994
male                0.02433625105  0.00267685896   9.091 < 0.0000000000000002
avg_friend_age      0.00166388874  0.00034601544   4.809    0.000001524090572
friend_country_cnt  0.00126832808  0.00024434666   5.191    0.000000210447684
songsListened       0.00000071282  0.00000004673  15.252 < 0.0000000000000002
lovedTracks         0.00009302361  0.00000460858  20.185 < 0.0000000000000002
playlists           0.00834912229  0.00101197896   8.250 < 0.0000000000000002
tenure             -0.00040899303  0.00007013351  -5.832    0.000000005527125
good_country       -0.02528691859  0.00265970867  -9.507 < 0.0000000000000002
                      
(Intercept)        ***
treatment          ***
age                ***
male               ***
avg_friend_age     ***
friend_country_cnt ***
songsListened      ***
lovedTracks        ***
playlists          ***
tenure             ***
good_country       ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.06890903)

    Null deviance: 3243.2  on 43826  degrees of freedom
Residual deviance: 3019.3  on 43816  degrees of freedom
AIC: 7152.8

Number of Fisher Scoring iterations: 2

All the predictors are significant and we hence find the odds ratio.

exp(coef(glm_treat4))

We can find that the odds ratio is similar for the glm model built with the propensity score matched data and the given dataset.

Our conclusion from the result would be that, the odds ratio of the user converting to a paid subscriber of the music app increases if the user has friends who are subscribers as well. Also it was found that people who are older tend to have sunscriptions because of the affordability factor and males have a higher probability of converting to paid users.

User engagement helps increase the chances of the customer paying for the services as the engagement variables songs listened, loved tracks, playlists have positive odds ratio coeffecients. Tenure however has an odds ratio less than 1 which we discuss in the implications part of Q5. Average friend age also has a postive odds ratio similar to the age predictor.

Business Takeaways¶

Key take aways from our analysis and the suggestions we would give High Note for a “free-to-fee” strategy would be the following. We find that demographics, peer influence, user engagement all have good effect on the adopter variable and hence formulate the below findings and suggestions to the company.

Having subscriber friends has a positive effect in increasing the odds of becoming a fee customer. So we want to offer group packages and referral schemes to get more users excited about the paid subscription.

As the tenure increases the likelihood of converting to a paid customer reduces, so we want to target the user with ads more during the initial phase of usage.

Songs listened, Loved tracks and playlists all have positive odds ratio and hence to capitalise on it we want to increase the number of songs offered, maybe expand with regional songs to improve user engagement.

If the user is from a good country the odds ratio of him converting is lesser because there are more competitive players in the market and we want to personalise the app more in such countries.

Friend Country count has a positive effect on the odds ratio so we can suggest people from other countries as friends in the recommendation to enhace chances.

	vars	n	mean	sd	min	max	range	se
age	1	3527	25.9798696	6.8435969	8	73	65	0.115234280
male	2	3527	0.7292316	0.4444197	0	1	1	0.007483255
friend_cnt	3	3527	39.7337681	117.2748963	1	5089	5088	1.974705464
subscriber_friend_cnt	4	3527	1.6368018	5.8499808	0	287	287	0.098503511
songsListened	5	3527	33758.0405444	43592.7275397	0	817290	817290	734.025780107
playlists	6	3527	0.9007655	2.5633919	0	118	118	0.043163065
posts	7	3527	21.2004536	221.9934445	0	8506	8506	3.737983845
shouts	8	3527	99.4397505	1156.0731310	0	65872	65872	19.466262602
lovedTracks	9	3527	264.3407995	491.4267571	0	10220	10220	8.274772631
friend_country_cnt	10	3527	7.1888290	8.8598183	0	136	136	0.149183945

	vars	n	mean	sd	min	max	range	se
age	1	40300	23.9484367	6.3718313	8	79	71	0.031740353
male	2	40300	0.6218610	0.4849286	0	1	1	0.002415601
friend_cnt	3	40300	18.4916625	57.4811697	1	4957	4956	0.286334101
subscriber_friend_cnt	4	40300	0.4174690	2.4181507	0	309	309	0.012045667
songsListened	5	40300	17589.4415136	28416.0229266	0	1000000	1000000	141.550292596
playlists	6	40300	0.5492804	1.0719556	0	98	98	0.005339791
posts	7	40300	5.2930025	104.3094298	0	12309	12309	0.519602280
shouts	8	40300	29.9726551	150.6897894	0	7736	7736	0.750639308
lovedTracks	9	40300	86.8226303	263.5804416	0	12522	12522	1.312987702
friend_country_cnt	10	40300	3.9578908	5.7641675	0	129	129	0.028713364

treatment	age	male	friend_cnt	avg_friend_age	avg_friend_male	friend_country_cnt	subscriber_friend_cnt	songsListened	lovedTracks	posts	playlists	shouts	adopter	tenure	good_country
0	23.74756	0.6288378	10.43133	23.76137	0.6131124	2.725062	0.000000	14602.22	65.21365	2.543377	0.5294671	16.42304	0.05243501	43.20268	0.3546936
1	25.37321	0.6362618	54.02097	25.39043	0.6358077	9.385626	2.300417	33735.64	225.36465	20.522956	0.7440700	101.81951	0.17754250	46.54871	0.3432760

treatment	age	male	friend_cnt	avg_friend_age	avg_friend_male	friend_country_cnt	subscriber_friend_cnt	songsListened	lovedTracks	posts	playlists	shouts	adopter	tenure	good_country	distance	weights
0	26.33238	0.6576402	21.46656	26.55723	0.6551451	5.091418	0.000000	27360.86	134.5440	6.277308	0.6722997	37.23618	0.08683702	47.70386	0.3581391	0.3039891	1
1	25.37321	0.6362618	54.02097	25.39043	0.6358077	9.385626	2.300417	33735.64	225.3647	20.522956	0.7440700	101.81951	0.17754250	46.54871	0.3432760	0.4635421	1