Interpolate Missing Values with Linear Regression

Here’s a quick little function that fills in the missing values of one column in a pandas DataFrame by fitting a linear regression on another column. Enjoy.

from sklearn import linear_model
import pandas as pd
import numpy as np

def interpolate_regression(df, columnx, columny):
    """Fill missing values of ``columny`` using a linear regression on ``columnx``.

    A one-feature ``LinearRegression`` is fitted on the rows where both
    columns are present, and its predictions are written into the rows where
    ``columny`` is missing and ``columnx`` is available.

    Args:
        df: pandas DataFrame containing both columns. It is modified in place.
        columnx: Label of the predictor column.
        columny: Label of the column whose missing values are filled.

    Returns:
        The same ``df`` object. Rows in which ``columnx`` is itself missing
        cannot be predicted and keep their missing ``columny`` value. If
        there is nothing to fill, ``df`` is returned untouched and no model
        is fitted.
    """
    x_present = df[columnx].notna()
    y_present = df[columny].notna()

    # Rows to fill: the target is missing but the predictor is available.
    to_fill = x_present & ~y_present
    if not to_fill.any():
        return df

    # Rows to learn from: both predictor and target are present.
    known = x_present & y_present
    x = df.loc[known, columnx].to_numpy().reshape(-1, 1)
    y = df.loc[known, columny].to_numpy()

    regr = linear_model.LinearRegression()
    regr.fit(x, y)

    z = df.loc[to_fill, columnx].to_numpy().reshape(-1, 1)
    # Assign through .loc so the write lands on df itself rather than on a
    # temporary produced by chained indexing.
    df.loc[to_fill, columny] = regr.predict(z)
    return df

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: