4. Regression Coefficientes#

# Import modules
import pandas as pd, seaborn as sns, numpy as np, matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
import math, warnings
import patsy
warnings.filterwarnings('ignore')
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 8
      6 import patsy
      7 warnings.filterwarnings('ignore')
----> 8 from sklearn import preprocessing
      9 label_encoder = preprocessing.LabelEncoder()

ModuleNotFoundError: No module named 'sklearn'

4.1. With Table#

data1 = pd.read_stata("https://github.com/worldbank/r-econ-visual-library/raw/master/Library/Data/METable.dta")
data1.head()

for i in ['refer', 'med_class_any_6', 'med_class_any_16']:
    label_encoder.fit(data1[i])
    data1[i] = label_encoder.transform(data1[i])
    data1[i] = np.where(data1[i] == 2, np.nan, data1[i])

data1.head()
study facilitycode case as_correct as_h1 as_h2 as_h6 as_h7 as_h8 as_h12 ... kenya_fac_qual_code_2 kenya_fac_qual_code_3 price_kenya facility_private checklist_essential sp_roster_age sp_roster_bmi sp_roster_bp sp_roster_bp_sys sp_roster_male
0 Kenya 9052302 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... No No 230.0 Private 0.500 31 32.038578 130/80 130 Female
1 Kenya 9012207 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 350.0 Private 0.625 31 32.038578 130/80 130 Female
2 Kenya 9052301 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... No No 400.0 Private 0.125 31 32.038578 130/80 130 Female
3 Kenya 9022203 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... No No 250.0 Private 0.125 31 32.038578 130/80 130 Female
4 Kenya 9012206 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... No No 300.0 Private 0.125 31 32.038578 130/80 130 Female

5 rows × 84 columns

# plt.figure(figsize=(8, 12))
y = ["as_correct", "ch_correct", "cp_correct", "tb_correct"
     , "refer"
     , "med_any"
     ,"med_class_any_6", "med_class_any_16"
    ]
y_label_t = [
    "Asthma: Inhaler/Bronchodilator"
    , "Child Diarrhea: ORS"
    , "Chest Pain: Referencial/Aspirin/ECG"
    , "Tuberculosis: AFB Smear"
    , "Referred (non-diarrhea)"
    , "Any Medication"
    , "Antibiotics"
    , "Steroids"
]

fmla = " ~ facility_private + C(case_code)"
fmla1 = " ~ facility_private"

coef = []
lower = []
upper = []
p_v = []
varname = []


for i in y:
    try:
        # mdl = smf.glm(i + fmla, data = data1).fit()
        mdl = smf.logit(i + fmla, data = data1).fit();
    except:
        mdl = smf.logit(i + fmla1, data = data1).fit();
        # mdl = smf.glm(i + fmla1, data = data1).fit()
        # print(i + fmla1)
        
    coef_data = pd.DataFrame(mdl.summary().tables[1].data)
    coef_val = np.float_(np.array(coef_data.iloc[2, [1, 4, 5, 6]]))
    
    coef.append(coef_val[0])
    p_v.append(coef_val[1])
    lower.append(coef_val[2])
    upper.append(coef_val[3])
    varname.append(i)
                                  
        
df0 = pd.DataFrame(
    {
        "coef": coef,
        "p_value": p_v,
        "l": lower,
        "u": upper,
        "var": varname,
    }
)
df1 = pd.melt(df0,id_vars=['var', 'coef', "p_value"], var_name='range', value_name='range_val').round(3)


OR = df0.coef.map(math.exp)
new_or = []
for i in OR:
    vl = str(round(i, 2))
    new_or.append(vl)
Optimization terminated successfully.
         Current function value: 0.486007
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.486007
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.574092
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.574092
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.308251
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.308251
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.607698
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.607698
         Iterations 5
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.155485
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.187942
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.531125
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.671791
         Iterations 4
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.072542
         Iterations: 35
### Plots    

OR = df0.coef.map(math.exp)
new_or = []
for i in OR:
    new_vl = round(float(i), 2)
    new_or.append(new_vl)
    
df0["OR"] = new_or
table_reg = (" " + df0["OR"].map(str) + "  -  " +  "[" + df0.l.map(str) + " , " + df0.u.map(str) + "]" + "  -  " + df0.p_value.map(str))[::-1]

fig = plt.figure(figsize = (8, 6))
ax = fig.add_axes([.1, 1, 1, 1])

for i in y[::-1]:
    ref_data = df1[df1["var"] == i]
    ax.plot("range_val", "var", data = ref_data, color = "black")
    ax.scatter("coef", "var", data = ref_data, color = "black")
    
    
omit_all = ['left', 'right', 'top', 'bottom']

ax.spines[omit_all].set_visible(False)

ax.axvline(0, linestyle = "-.", lw = .5, color = "black")

for i in range(8):
    lbl = table_reg.iloc[i]
    ax.text(3.35, i, lbl, size=12);

ax.text(3.35, 7.5, "OR         95% CI           P-Value", size = 13);

ax.set_xticks(np.arange(-4, 4.1, 2))
ax.set_xlabel(r"$\leftarrow$ Favors Public                         Favors Privte $\rightarrow$")
ax.set_yticklabels(y_label_t[::-1])
plt.show();
../_images/231b9ada6741613be0c75d43dbd8e7fd8598ede9c7c45f234d36f2020decdc9e.png

4.2. With Tables of Two Datasets#

data1 = pd.read_stata("https://github.com/d2cml-ai/python_visual_library/raw/main/data/METable2data.dta")
data2 = pd.read_stata("https://github.com/d2cml-ai/python_visual_library/raw/main/data/METable2data2.dta")
data1['case_3'] = np.where(data1.case == "Case 3", 1, 0)
y_vars = [
    "correct", 
    "treat_cxr", 
    "re_3", 
    "re_4", 
    "med_any", 
    "med_l_any_2", 
    "med_l_any_3", 
    "med_k_any_9"
]
for j in [data1, data2]:
    for i in y_vars:
        # print(i)
        label_encoder.fit(j[i])
        j[i] = label_encoder.transform(j[i])
coef0 = []
l0 = []
u0 = []
y_pos = []
y_label = []
p_val = []
y_0 = len(y_vars) 

fmla = '~ case_3  + C(city) + C(type_formal)'
for i in y_vars:
    while y_0 > 0:
        y_pos.append(y_0)
        y_0 = y_0 - 1
    y1, x_1 = patsy.dmatrices(i + fmla, data = data1, return_type='dataframe')
    logit_glm = sm.GLM(y1, x_1, var_weights=data1['weight_city'], family = sm.families.Binomial()).fit(cov_type='HC1').summary().tables[1]
    # print(logit_glm)
    logit_arr = np.float_(pd.DataFrame(logit_glm.data).iloc[4, [1, 2, 4]])
    c = np.exp(logit_arr[0])
    coef0.append(c)
    l0.append(c - 1.96 * logit_arr[1])
    u0.append(c + 1.96 * logit_arr[1])
    p_val.append(logit_arr[2])
    # print(logit_df)
    y_label.append(i)

df1 = pd.DataFrame({"point": coef0, "l" : l0, 'u': u0, "label" : y_label, "y_pos": y_pos, 'pval': p_val})
df11 = pd.melt(df1, id_vars=['point', 'label', 'y_pos', "pval"], value_vars=['l', 'u'])
fmla2 = '~ sp4_spur_1'
coef = []
l = []
u = []
y_pos = []
y_label = []
y_0 = len(y_vars) 
p_val = []

for i in y_vars:
    while y_0 > 0:
        y_pos.append(y_0)
        y_0 = y_0 - 1
    
    mdl2 = smf.logit(i + fmla2, data = data2).fit().summary().tables[1]
    # print(mdl2)
    coefs = np.float_(pd.DataFrame(mdl2.data).iloc[2, [1, 2, 4]])
    c = coefs[0]
    coef.append(np.exp(c))
    l.append(np.exp(c) - 1.96 * coefs[1])
    u.append(np.exp(c) + 1.96 * coefs[1])
    y_label.append(i)
    p_val.append(coefs[2])

df2 = pd.DataFrame({"point": coef, "l" : l, 'u': u, "label" : y_label, "y_pos": y_pos, "pval": p_val})
df21 = pd.melt(df2, id_vars=['point', 'label', 'y_pos', "pval"], value_vars=['l', 'u'])
Optimization terminated successfully.
         Current function value: 0.648644
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.161501
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.635826
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.638312
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.495496
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.320903
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.649982
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.097266
         Iterations 8
df1 = df1.round(2)
df2 = df2.round(2)
# df1.iloc[0, :]
fig, ax = plt.subplots(2, 1, facecolor = "white", figsize = (7, 10))
fig.subplots_adjust(hspace = .5)

max_r = 7

for i in y_vars:
    ref_data1 = df11[df11.label == i]
    ax[0].plot('value', 'y_pos', data = ref_data1, c = "black")
ax[0].scatter("point", "y_pos", data = df11, c = "black")

for i in y_vars:
    ref_data2 = df21[df21.label == i]
    ax[1].plot('value', 'y_pos', data = ref_data2, c = "black")
ax[1].scatter("point", "y_pos", data = df21, c = "black")

omit = ['top', 'bottom', 'right', 'left']
ylabels = [
    'Steroids', 'Other Antibiotic', 'Fluoroquinolone', 'Any Medicine', 'Xpert MTB/RIF', 'Sputum AFB', 
    'Chest-X-Ray', 'Correct Management'
]
           

for i in range(2):
    ax[i].spines[omit].set_visible(False)
    ax[i].set_xlim(1 - max_r, 1 + max_r)
    ax[i].axvline(1, c = 'black', lw = 1, alpha = .5)
    ax[i].set_yticks(range(1, len(df1) + 1))
    ax[i].set_yticklabels(ylabels)
    ax[i].set_xticks(np.arange(-6, 8, 3.5))
    ax[i].text(max_r + 1, len(df1) + .5 , "OR") # OR
    ax[i].text(max_r + 3.5, len(df1) + .5, "95%CI") # CI
    ax[i].text(max_r + 5.5, len(df1) + .5, "P-value") # CI

for i in range(len(df1)):
    ref_t1 = np.array(df1.iloc[i, :])
    ax[0].text(max_r + 1, ref_t1[4], ref_t1[0]) # OR
    ax[0].text(max_r + 3, ref_t1[4], f"[{ref_t1[1]} , {ref_t1[2]}]", ) # CI
    ax[0].text(max_r + 6, ref_t1[4], ref_t1[5]) # P-value
    
    ax[0].set_xlabel(r"$\leftarrow$ Favors Ordinary (N = 407)    Favors Privte  (N = 352)$\rightarrow$")
    ax[0].set_title("A. Case 1 vs Case 3 in all providers receiving both cases\n", size = 15)
for i in range(len(df2)):
    ref_t1 = np.array(df2.iloc[i, :])
    ax[1].text(max_r + 1, ref_t1[4], ref_t1[0]) # OR
    ax[1].text(max_r + 3, ref_t1[4], f"[{ref_t1[1]} , {ref_t1[2]}]", ) # CI
    ax[1].text(max_r + 6, ref_t1[4], ref_t1[5]) # P-value
    ax[1].set_xlabel(r"$\leftarrow$ Favors Ordinary (N = 50)    Favors Privte  (N = 51)$\rightarrow$")
    ax[1].set_title("B. SP4 with and without sputum report in Mumbai MBBS+\n", size = 15)

# ax[0].set_xlim(0, 10)
../_images/a626611343a2291d99d9d760826c88be3133449cdfac40844b687241a626b402.png

4.3. With Graded error bars#

from linearmodels.panel import PanelOLS as fe

data3 = pd.read_stata("https://github.com/worldbank/r-econ-visual-library/raw/master/Library/Data/ReplicationDataGhanaJDE_short.dta")
data3.head(3)

data30 = data3[(data3.wave >= 2) & (data3.cashtreat != 1)]

def f1(df, a, b):
    cond = (df.a == 3) & (df.b == 1)
    return cond


data30_she = data30.groupby('sheno')
data30['treatment'] = data30_she['wave'].transform(lambda x: x == 3 ) & data30_she['timetreat'].transform(lambda x: x == 1 )
data30['control'] = data30_she['control'].transform(lambda x: all(x))
data30['after'] = data30_she['wave'].transform(lambda x: np.where(x >=3, True, False))


features = [
    "realfinalprofit", 
    "expend_health_3months", 
    "expend_education_3months", 
    "expend_total_3months",
    "wave", 
    "equiptreat",
    "after",
    "sheno"
]
data31 = data30[(data30['treatment'] == True) | (data30.control == True)][features]
data31 = data31.set_index(['sheno', 'wave'])
data31.head()
realfinalprofit expend_health_3months expend_education_3months expend_total_3months equiptreat after
sheno wave
110101604 5 351.575378 5.0 0.0 45.0 0 True
3 205.722168 0.0 0.0 32.0 0 True
6 402.129395 23.0 0.0 73.0 0 True
4 322.331146 0.0 0.0 25.0 0 True
2 228.246490 0.0 0.0 30.0 0 False
y = [ 
    "realfinalprofit", 
     "expend_health_3months", 
    "expend_education_3months", 
    "expend_total_3months"
]

fmla = " ~ equiptreat * after - 1"

c, Ci90, Ci95 = [], [], []
y_pos = []
y_lbl = []
y_n = 1
for i in y:
    y_lbl.append([i, i])
    while y_n <= len(y):
        y_pos.append([y_n, y_n])
        y_n = y_n + 1
    y_data, x_data = patsy.dmatrices(i + fmla, data = data31, return_type='dataframe')
    x_data1 = x_data['equiptreat:after[T.True]']
    y_data, x_data1
    mdl = fe(y_data, exog = x_data1, time_effects=True, drop_absorbed = True).fit( cov_type='clustered'
            , cluster_entity = True )
    ci95 = np.array(mdl.conf_int().iloc[0, [0, 1]])
    ci90 = np.array(mdl.conf_int(.9).iloc[0, [0, 1]])
    Ci90.append(ci90)
    Ci95.append(ci95)
    c.append([mdl.params[0], mdl.params[0]])

c = np.concatenate(c)    
Ci90 = np.concatenate(Ci90)
Ci95 = np.concatenate(Ci95)    
y_pos = np.concatenate(y_pos)
y_lbl = np.concatenate(y_lbl)
df3 = pd.DataFrame(
    {
       "point": c, 
       "ci90": Ci90, 
       "ci95": Ci95, 
       "y_pos": y_pos 
    }
)
point ci90 ci95 y_pos
0 -9.476182 -30.082561 -34.033920 1
1 -9.476182 11.130197 15.081556 1
2 -9.610347 -23.957912 -26.709284 2
3 -9.610347 4.737218 7.488590 2
4 18.580187 -9.496230 -14.880299 3
5 18.580187 46.656605 52.040673 3
6 19.035654 -15.885621 -22.581464 4
7 19.035654 53.956929 60.652772 4
fig = plt.figure(facecolor='white', figsize=(7, 5))
ax = fig.add_axes([.1, 1, 1, 1])
lws = [6, 3]
colors = ["#4b6189", "#478abc"]
for i in range(4):
    ref_data = df3[df3.y_pos == i]
    plt.scatter("point", "y_pos", data = ref_data, color = "black", label = "", s = 100)
    plt.plot("ci90", 'y_pos', data = ref_data, color = colors[0], lw = lws[0], alpha = .7, label = "")
    plt.plot("ci95", 'y_pos', data = ref_data, color = colors[1], lw = lws[1], alpha = .7, label = "")

ref_data = df3[df3.y_pos == 4]
ax.scatter("point", "y_pos", data = ref_data, color = "black", label = "", s = 100)
ax.plot("ci90", 'y_pos', data = ref_data, color = colors[0], lw = lws[0], alpha = .7, label = "90%")
ax.plot("ci95", 'y_pos', data = ref_data, color = colors[1], lw = lws[1], alpha = .7, label = "95%")

omit = ['top', 'bottom', 'left', 'right']
y_labels = ['3mo Edu. Exp. (cedi)', '3mo Health Exp', '3mo Total Exp.', '3mo Real Profit (cedi)']
ax.legend(title = "Confidence level", ncol = 2)
ax.axvline(0, color = "black", lw = 1, linestyle = "--")
ax.set_yticks(range(1, 5))
ax.set_xticks(np.arange(-30, 61, 30))
ax.set_yticklabels(y_labels)
ax.set_ylabel("Point estimates & 95% CI", size = 14)
ax.spines[omit].set_visible(False)
../_images/113f10de3cffbbdb24c7a2959439e30c739350220c19a8229d0cb8e9d55e913e.png

4.4. Marginal Effect#

data4 = pd.read_stata("https://github.com/worldbank/r-econ-visual-library/raw/master/Library/Data/RegCoefME.dta")
data4.head()
study facilitycode case as_correct as_h1 as_h2 as_h6 as_h7 as_h8 as_h12 ... kenya_fac_qual_code_2 kenya_fac_qual_code_3 price_kenya facility_private checklist_essential sp_roster_age sp_roster_bmi sp_roster_bp sp_roster_bp_sys sp_roster_male
0 Kenya 9052302 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... No No 230.0 Private 0.500 31 32.038578 130/80 130 Female
1 Kenya 9012207 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 350.0 Private 0.625 31 32.038578 130/80 130 Female
2 Kenya 9052301 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... No No 400.0 Private 0.125 31 32.038578 130/80 130 Female
3 Kenya 9022203 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... No No 250.0 Private 0.125 31 32.038578 130/80 130 Female
4 Kenya 9012206 Child Diarrhea NaN NaN NaN NaN NaN NaN NaN ... No No 300.0 Private 0.125 31 32.038578 130/80 130 Female

5 rows × 84 columns

y4 = [
   'as_correct', 'ch_correct', 'cp_correct', 'tb_correct', 
    # "refer"
    "med_any", "med_class_any_6", "med_class_any_16"]

for i in y4:
    label_encoder.fit(data4[i])
    data4[i] = label_encoder.transform(data4[i])
    
    data4[i] = np.where(data4[i] == 2, np.nan, data4[i])
    # print(set(data4[i]))
md1 = " ~ facility_private + C(case_code)"
md2 = " ~ facility_private"


ci95 = [] # CI
c = [] # cofficiente
y_n = [] # y posicion
y_name = [] # y label
y_label = []

id_m = 0 # axis 0

main_row = 2

correct_array = ['as_correct', 'ch_correct', 'cp_correct', 'tb_correct']

for i in y4:
    
    while id_m < len(y4): ## Y variables
        y_n.append([id_m + 1, id_m + 1, id_m + 1, id_m + 1])
        id_m = id_m + 1
    ### ols
    
    if i in correct_array:
        mdl_ols = smf.ols(i + md2, data = data4).fit()
    else:
        mdl_ols = smf.ols(i + md1, data = data4).fit()
        # print(i)
        # print(pd.DataFrame(mld_ols.summary().tables[1].data))
    
    coef_ols = np.float_(pd.DataFrame(mdl_ols.summary().tables[1].data).iloc[main_row, [1, 2]])
    
    point = coef_ols[0]
    std = coef_ols[1]
    
    y_label.append([i, i])
    y_name.append(["ols", "ols"])
    c.append([point, point])
    ci95.append([point + 1.96 * std, point - 1.96 * std])
    
    ### GLM
    if i in correct_array:
        mdl_glm = smf.logit(i + md2, data = data4).fit()
    else:
        mdl_glm = smf.logit(i + md2, data = data4).fit()
    
    coef_glm = np.float_(pd.DataFrame(mdl_glm.get_margeff().summary().tables[1].data).iloc[main_row - 1 , [1, 2]])
    
    point_glm = coef_glm[0]
    std_glm = coef_glm[1]
    
    y_label.append([i, i])
    y_name.append(["glm", "glm"])
    c.append([point_glm, point_glm])
    ci95.append([point_glm + 1.96 * std_glm, point_glm - 1.96 * std_glm])
Optimization terminated successfully.
         Current function value: 0.486007
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.574092
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.308251
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.607698
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597636
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692627
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.086124
         Iterations 9
df41 = pd.DataFrame(
    {
        "modelo": np.concatenate(y_name), 
        "ylbel": np.concatenate(y_label),
        "y_pos": np.concatenate(y_n),
        "coef": np.concatenate(c), 
        "ci": np.concatenate(ci95)
    }
)
fig = plt.figure()
ax = fig.add_axes([.1, 1, 1, 1])

mdls = ['ols', 'glm']
y_val = np.unique(df41.y_pos)
separate = .2
colors = ['#4392bc', '#af370f']

for i in mdls:
    for j in y_val:
        ref_data = df41[(df41.y_pos == j) & (df41.modelo == i)]
        
        if i == "ols":
            ref_data['y_pos'] = 8 - ref_data['y_pos'] + separate 
            cl = colors[0]
        elif i == "glm":
            cl = colors[1]
            ref_data['y_pos'] = 8- ref_data['y_pos'] - separate 
            
        ax.plot("ci", "y_pos", data = ref_data, c = cl, label = "")
        ax.scatter("coef", "y_pos", data = ref_data, c = cl, label = "")
        # ax.text("coef", "y_pos", "ylbel", data = ref_data)

for i in mdls:
    ref_data = df41[(df41.y_pos == 7) & (df41.modelo == i)]
    if i == "ols":
        ref_data['y_pos'] = 8 - ref_data['y_pos'] + separate 
        cl = colors[0]
        lbl = "Linear Model"
    elif i == "glm":
        cl = colors[1]
        ref_data['y_pos'] = 8 - ref_data['y_pos'] - separate 
        lbl = "Logistic Model"
            
    ax.plot("ci", "y_pos", data = ref_data, c = cl, label = lbl)
    ax.scatter("coef", "y_pos", data = ref_data, c = cl, label = "")
            
y_label_t = [
    "Asthma: Inhaler/Bronchodilator"
    , "Child Diarrhea: ORS"
    , "Chest Pain: Referencial/Aspirin/ECG"
    , "Tuberculosis: AFB Smear"
    # , "Referred (non-diarrhea)"
    , "Any Medication"
    , "Antibiotics"
    , "Steroids"
]
omit = ['top', 'right', 'left', 'bottom']
ax.spines[omit].set_visible(False)      
ax.set_xlabel(r"$\leftarrow$ Favors Public                       Favors Privte $\rightarrow$")
ax.set_xlim(-1, 1)
ax.set_xticks(np.arange(-1, 1.1, .5))
ax.set_yticks(np.arange(1, 7.1, 1))
ax.set_yticklabels(y_label_t[::-1])
ax.legend(ncol = 2, loc = (.24, -.2))
ax.axvline(0, linestyle = "--", lw = 1.5, color = "#cccccc")
plt.show();
../_images/22028955a283fddf06c7c46828254d70c4d5dd3e9fc9714d1e50a7401f6af25e.png

4.5. Multiple Outcomes#

data5 = pd.read_csv("https://github.com/d2cml-ai/python_visual_library/raw/reg-coef/data/GhanaJDE.csv")
features = [
     "expend_health_3months", 
    "expend_education_3months", 
    "expend_total_3months",
    'realfinalprofit', 'cashtreat', 'equiptreat', 'after', 'wave', 'sheno']

data5 = data5[features]
data5_inx = data5.set_index(['sheno', 'wave'])
data5_inx


data5 = pd.read_csv("https://github.com/d2cml-ai/python_visual_library/raw/reg-coef/data/GhanaJDE.csv")
features = [
     "expend_health_3months", 
    "expend_education_3months", 
    "expend_total_3months",
    'realfinalprofit', 'cashtreat', 'equiptreat', 'after', 'wave', 'sheno']

data5 = data5[features]
data5_inx = data5.set_index(['sheno', 'wave'])
data5_inx.head()
expend_health_3months expend_education_3months expend_total_3months realfinalprofit cashtreat equiptreat after
sheno wave
110101604 5 5.0 0.0 45.0 351.575378 0 0 True
3 0.0 0.0 32.0 205.722168 0 0 True
6 23.0 0.0 73.0 402.129395 0 0 True
4 0.0 0.0 25.0 322.331146 0 0 True
2 0.0 0.0 30.0 228.246490 0 0 False
y = [ 
    "expend_education_3months", 
     "expend_health_3months", 
    "expend_total_3months",
    "realfinalprofit"
]

y_label = []
ci = []
c = []
y_pos = []
y_lbl = []
y_ticks_lbl = []

y_n = 1

for i in y:
    while y_n <= len(y):
        y_pos.append(np.repeat(y_n, 4))
        y_n = y_n + 1
    #----
    flma = i + " ~ (cashtreat + equiptreat) * after - 1"
    y_data, x_data = patsy.dmatrices(flma, data5_inx, return_type='dataframe')
    x_data1 = x_data.filter(['cashtreat:after[T.True]',
           'equiptreat:after[T.True]'])
    #----
    mdl = fe(y_data, exog = x_data1, time_effects=True, entity_effects=True).fit(cov_type='clustered'
            , cluster_entity = True )
    #---
    
    y_ll = ['cash', 'cash', 'kind', 'kind']
    y_lbl.append(y_ll)
    
    y_ticks_lbl.append(np.repeat(i, 4))
    
    point_cash = mdl.params[0]
    point_kind = mdl.params[1]
    
    cash_ci = np.array(mdl.conf_int().iloc[0, [0, 1]])
    kind_ci = np.array(mdl.conf_int().iloc[1, [0, 1]])
    
    c.append([point_cash, point_cash, point_kind, point_kind])
    ci.append([cash_ci, kind_ci])
    
    

y_pos = np.concatenate(y_pos)
c = np.concatenate(c)
ci = np.concatenate(np.concatenate(ci))
y_lbl = np.concatenate(y_lbl)
lbls_y = np.concatenate(y_ticks_lbl)

df5 = pd.DataFrame(
    {
       "y_pos": y_pos, 
       "point": c, 
       "ci": ci, 
       "type": y_lbl, 
        "y": lbls_y
    }
)
d_iff, alphha = .13, .5
df5['y_pos1'] = np.where(df5['type'] == "cash", df5['y_pos'] - d_iff, df5['y_pos'] + d_iff)
cls = ["#c96e1e", '#169930']

fig = plt.figure(facecolor='white', figsize=(4, 5))
ax = fig.add_axes([.1, 1, 1, 1])


for i in y:
    for j in ['cash', 'kind']:
        if j == 'cash':
            cl = cls[1]
        else:
            cl = cls[0]
            
        ref_data = df5[(df5['type'] == j) & (df5['y'] == i)]
        ax.scatter("point", "y_pos1", data = ref_data, color = cl, label = "")
        ax.plot("ci", "y_pos1", data = ref_data, color = cl, label = "", alpha = alphha)

for j in ['cash', 'kind']:
    if j == 'cash':
        cl = cls[1]
        lbl = "Cash"
    else:
        cl = cls[0]
        lbl = "In-kind"

    ref_data = df5[(df5['type'] == j) & (df5['y'] == "realfinalprofit")]
    ax.scatter("point", "y_pos1", data = ref_data, color = cl, label = "")
    ax.plot("ci", "y_pos1", data = ref_data, color = cl, label = lbl, alpha = alphha)        
        
y_labels = ['3mo Edu. Exp. (cedi)', '3mo Health Exp', '3mo Total Exp.', '3mo Real Profit (cedi)']
omit = ['top', 'bottom', 'left', 'right']
        
ax.legend(title = "Treatment", loc = (1, .5))
ax.axvline(0, lw = 1, linestyle = "--", color = "black")

ax.set_ylabel("Point Estimates & 95% CI", size = 15)
ax.set_yticks(range(1, 5))
ax.set_xticks(np.arange(-100, 101, 50))
ax.set_yticklabels(y_labels, size = 12)
ax.spines[omit].set_visible(False)
../_images/c13a8d0a2097d0eab6d552b5daa7e63274e9adbe25c9f29ba46c4553506bbf9e.png

4.6. Point Estimates by Rounds#

data6 = pd.read_csv("https://github.com/d2cml-ai/python_visual_library/raw/reg-coef/data/GhanaJDE.csv")
features = ['realfinalprofit', 'cashtreat', 'equiptreat', 'after', 'wave', 'sheno']

data6 = data6[features]
data6['wave1'] = data6['wave'].astype('category')
data6_inx = data6.set_index(['sheno', 'wave'])
data6_inx.head()

flma = "realfinalprofit ~ (cashtreat + equiptreat) * wave1 -1"
y_data, x_data = patsy.dmatrices(flma, data6_inx, return_type='dataframe')

x_data1 = x_data.filter(regex=r"cashtreat:|equiptreat:")

mdl = fe(y_data, exog = x_data1, time_effects=True, entity_effects=True).\
fit(cov_type='clustered', cluster_entity = True )
c = []
ci = []
x_p = mdl.params
x_ci = mdl.conf_int()
for i in range(0, len(x_p)):
    
    c.append([x_p[i], x_p[i]])
    ci.append(np.array(x_ci.iloc[i, [0, 1]]))
    
x_n = np.arange(1, 5, 1)

c = np.concatenate(c)
ci = np.concatenate(ci)
lbl = np.concatenate([np.repeat("cash", 8), np.repeat("equi", 8)])
x_pos = np.concatenate([np.repeat(x_n, 2), np.repeat(x_n, 2)])

df6 = pd.DataFrame(
    {
        "point": c,
        "ci" : ci, 
        "treat" : lbl, 
        "x" :  x_pos 
    }
)
sep = .15
df6['x_pos'] = np.where(df6.treat == 'cash', df6.x - sep, df6.x + sep)
df6.head()
point ci treat x x_pos
0 0.396657 -96.518288 cash 1 0.85
1 0.396657 97.311602 cash 1 0.85
2 -25.720863 -150.319739 cash 2 1.85
3 -25.720863 98.878013 cash 2 1.85
4 -16.036545 -124.272709 cash 3 2.85
fig = plt.figure(figsize=(8, 5))
ax = fig.add_axes([.1, 1, 1, 1])
cls = ["#169930", '#c96e1e']
omit = ['right', 'left', 'top', 'bottom']
alpha = 1
for i in ['cash', 'equi']:
    for j in x_n:
        ref_data = df6[(df6.x == j) & (df6.treat == i)]
        if i == 'cash':
            cl = cls[0]
        else:
            cl = cls[1]
        plt.plot("x_pos", 'ci', data = ref_data, c = cl, label = "")
        plt.scatter('x_pos', 'point', data = ref_data, c = cl, label = "")
        
for i in ['cash', 'equi']:
    ref_data = df6[(df6.x == j) & (df6.treat == 1)]
    if i == 'cash':
        cl = cls[0]
        lbl = "Cash"
    else:
        cl = cls[1]
        lbl = "Equipment"
    plt.plot("x_pos", 'ci', data = ref_data, c = cl, label = lbl)
    plt.scatter('x_pos', 'point', data = ref_data, c = cl, label = "")
        
x_ti_labels = ['Wave 3', 'Wave 4', 'Wave 5', "Wave 6"] 
ax.legend(title = "Treatment Group", loc = (1, .4), )
ax.axhline(0, c = 'black', lw = 1, linestyle = "--")

ax.spines[omit].set_visible(False)
ax.set_yticks(np.arange(-150, 151, 50))
ax.set_xticks(np.arange(1, 5, 1))
ax.set_xticklabels(x_ti_labels, size = 14)
ax.set_ylabel("Point Estimates & 95% CI", size = 15)
plt.show();
Text(0, 0.5, 'Point Estimates & 95% CI')
../_images/3746c506ca7b3f50499af9eccc6332b62cab00976121e4ef38e3365e6787f6b1.png