Transforms for Regression¶
Author: Calvin He¶
Below is an example of heteroscedastic data following a linear trend. Heteroscedasticity is the property that the variance of the data changes with the independent variable (or predictor). Ordinary linear regression assumes that the data are not heteroscedastic. Therefore, transformations of the data are recommended in order to make the data homoscedastic.
There are a number of suggested transforms including:
- log
- square root
- inverse
There are also methods like Box-Cox, which searches over a family of power transforms (including the ones above as special cases) and automatically determines a "best" transform. However, finding the right transform currently seems more like an art than a science.
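As a quick illustration of the idea, `scipy.stats.boxcox` estimates the exponent $\lambda$ by maximum likelihood, where $\lambda \approx 0$ corresponds to a log transform, $\lambda \approx 0.5$ to a square root, and $\lambda = -1$ to an inverse. This is a minimal sketch; the lognormal sample and seed are made up for demonstration:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
# A right-skewed (lognormal) sample, a typical candidate for Box-Cox
data = rng.lognormal(mean=0.0, sigma=0.5, size=500)

# With lmbda=None, boxcox returns the transformed data and the
# lambda that maximizes the log-likelihood. Box-Cox requires
# strictly positive input.
transformed, lmbda = stats.boxcox(data)
print(f"estimated lambda: {lmbda:.2f}")
```

For lognormal data the estimated $\lambda$ should come out near 0, i.e. close to a plain log transform.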
This is a sandbox for testing transforms.
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import power_transform
import statsmodels.api as sm
import pandas as pd
# Data Generation
np.random.seed(400)
x = np.random.uniform(10,50,1000)
y = np.random.normal(x+40,np.abs(x+10)/2)
# Fitting of Generated Data
X = sm.add_constant(x)
model = LinearRegression().fit(X,y)
y_fit = model.predict(X)
resid = y-y_fit
# TRANSFORMATIONS
xy = pd.DataFrame({'x':x,'y':y})
xy_trans = power_transform(xy,method='box-cox')
x_t = xy_trans[:,0]
y_t = xy_trans[:,1]
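A side note: `power_transform` discards the fitted exponents, but the `PowerTransformer` estimator behind it exposes them via `lambdas_`. A self-contained sketch that recreates the data from above:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

# Recreate the data generated above so this snippet runs standalone
np.random.seed(400)
x = np.random.uniform(10, 50, 1000)
y = np.random.normal(x + 40, np.abs(x + 10) / 2)
xy = pd.DataFrame({'x': x, 'y': y})

# Same method and defaults as power_transform, but as an estimator,
# so the fitted Box-Cox exponents are available after fitting
pt = PowerTransformer(method='box-cox')
xy_trans = pt.fit_transform(xy)
print(pt.lambdas_)  # one fitted lambda per column
```

Inspecting the lambdas is a useful sanity check on what Box-Cox actually decided to do to each variable.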
# Fitting of Transformed Data
X_t = sm.add_constant(x_t)
model_t = LinearRegression().fit(X_t,y_t)
y_t_fit = model_t.predict(X_t)
resid_t = y_t-y_t_fit
# Plotting
def plot_data(x,y,y_fit,title,xlabel,ylabel):
    plt.plot(x,y,'b.',alpha=0.5,label='data')
    plt.plot(x,y_fit,'r-',label='fit')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
plt.figure(figsize=(8,6))
plt.subplot(2,2,1)
plot_data(x,y,y_fit,'Heteroscedastic Data','x','y')
plt.subplot(2,2,2)
plot_data(x_t,y_t,y_t_fit,'Transformed Data','x_t','y_t')
plt.subplot(2,2,3)
plot_data(x,resid,np.zeros(len(x)),'Heteroscedastic Data Residuals','x','resid')
plt.subplot(2,2,4)
plot_data(x_t,resid_t,np.zeros(len(x_t)),'Transformed Data Residuals','x_t','resid')
plt.tight_layout()
plt.show()
The plots show that the Box-Cox method does not do a great job of eliminating the heteroscedasticity.
Inspired by a YouTube video, I decided to make a GUI with sliders to adjust and visualize transformations in real time. Unfortunately, this widget is not interactive in HTML, so you will have to download and run the associated .ipynb file to try it out.
import ipywidgets as widgets
from IPython.display import display
x_value = widgets.FloatSlider(description='x_value',value=2.6,min=-5,max=5)
x_method = widgets.Dropdown(options=['none','log','power'],description='x_method',value='power')
y_value = widgets.FloatSlider(description='y_value',value=0.1,min=-5,max=5)
y_method = widgets.Dropdown(options=['none','log','power'],description='y_method',value='power')
def transform_and_plot(x_method, x_value, y_method, y_value):
    x_t = x
    y_t = y
    if x_method == 'log':
        x_t = np.log(x)
    elif x_method == 'power':
        x_t = x**x_value
    elif x_method == 'none':
        x_t = x
    if y_method == 'log':
        y_t = np.log(y)
    elif y_method == 'power':
        y_t = y**y_value
    elif y_method == 'none':
        y_t = y
    X_t = sm.add_constant(x_t)
    model = LinearRegression().fit(X_t,y_t)
    y_fit = model.predict(X_t)
    resid = y_t-y_fit
    resid_rms = np.sqrt(np.mean(resid**2))
    plt.figure(figsize=(12,8))
    plt.subplot(2,3,1)
    plt.hist(x_t,bins=30)
    plt.xlabel('x_t')
    plt.subplot(2,3,2)
    plt.plot(x_t, resid,'b.',alpha=0.5)
    plt.hlines(0,min(x_t),max(x_t),color='red')
    plt.hlines(resid_rms,min(x_t),max(x_t),color='red',linestyle='--',label='resid rms')
    plt.hlines(-resid_rms,min(x_t),max(x_t),color='red',linestyle='--')
    plt.xlabel('x_t')
    plt.ylabel('resid')
    plt.legend()
    plt.subplot(2,3,3)
    plt.hist(resid,bins=30,orientation='horizontal')
    plt.ylabel('resid')
    plt.subplot(2,3,4)
    plt.plot(x_t,y_t,'b.',alpha=0.5)
    plt.plot(x_t,y_fit,'r-')
    plt.xlabel('x_t')
    plt.ylabel('y_t')
    plt.subplot(2,3,5)
    plt.hist(y_t,bins=30,orientation='horizontal')
    plt.ylabel('y_t')
    plt.tight_layout()
    plt.show()
out = widgets.interactive_output(transform_and_plot,
{'x_value': x_value,'x_method': x_method,'y_value':y_value,'y_method':y_method})
out.layout.height = '800px'
widgets.VBox([widgets.HBox([widgets.VBox([x_method,x_value]),widgets.VBox([y_method,y_value])]), out])
After playing around with the controls, I determined that the best transforms for reducing heteroscedasticity are: $$ y \rightarrow y^{0.1} $$ $$ x \rightarrow x^{2.6} $$ However, we now see a potential problem in the data distribution. What started out as a uniform distribution in $x$ now looks more like a Pareto distribution. Furthermore, the residuals follow a skewed distribution, which violates the normality assumption of linear regression. Part of the problem may be due to outliers sampled from the random number generator. On the other hand, it may simply be that some data sets cannot meet the assumptions of linear regression regardless of the transforms applied.