# Takes in the user's housing preferences (ideal neighborhood, minimum square feet,
# minimum bedrooms, minimum bathrooms, and minimum year built), filters the dataset
# accordingly, and fits a linear regression model as a predictor of housing price
# vs. square feet. Actual and predicted prices are visualized together in a single
# scatterplot, and the model's R-squared score is reported as an accuracy metric.
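# Expected columns in housing_price_dataset.csv, based on how they are used below:
# SquareFeet, Bedrooms, Bathrooms, YearBuilt, Price, and Neighborhood (with the
# values Urban, Suburb, or Rural).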
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


def main():
    # wrangling the data: one-hot encode Neighborhood into indicator columns
    df = pd.read_csv('housing_price_dataset.csv')
    df = pd.get_dummies(df, columns=['Neighborhood'])

    # collecting user preferences
    print("Hello User! Thank you for using our Housing Price Predictor!")
    print()
    print("Enter your house preferences below: ")
    neighborhood = input("Ideal Neighborhood (Urban, Suburb, Rural)? ")
    while neighborhood not in ['Urban', 'Suburb', 'Rural']:
        print("Invalid neighborhood. Please enter your ideal neighborhood again:")
        neighborhood = input("Ideal Neighborhood (Urban, Suburb, Rural)? ")
    min_sqft = int(input("Minimum Square Feet? "))
    min_beds = int(input("Minimum Bedrooms? "))
    min_baths = int(input("Minimum Bathrooms? "))
    min_year = int(input("Minimum Year Built? "))
    # filtering data based on user preferences
    dff = df[df['SquareFeet'] >= min_sqft]
    dff = dff[dff['Bedrooms'] >= min_beds]
    dff = dff[dff['Bathrooms'] >= min_baths]
    dff = dff[dff['YearBuilt'] >= min_year]
    # keep only rows in the chosen neighborhood (one-hot column from get_dummies)
    dff = dff[dff['Neighborhood_' + neighborhood] == 1]
    if dff.empty:
        print("No houses match your criteria.")
        return
    # building ML predictions
    features = ['SquareFeet', 'Bedrooms', 'Bathrooms', 'YearBuilt',
                'Neighborhood_Rural', 'Neighborhood_Suburb', 'Neighborhood_Urban']
    X = dff[features].copy()
    y = dff['Price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # attach predictions to a copy of the test features and sort by square feet for plotting
    X_test = X_test.copy()
    X_test["y_pred"] = y_pred
    X_test = X_test.sort_values("SquareFeet")
    # attach actual prices to the feature frame and sort for the scatterplot
    X["y"] = y
    X = X.sort_values("SquareFeet")
    # visualize predictions
    fig, ax = plt.subplots(1, 1, figsize = (12, 7))
    ax.scatter(X['SquareFeet'], X['y'], color = "darkblue", alpha = 0.4, label = "Actual Values", marker = '.')
    # smooth the predicted prices with a rolling mean; min_periods=1 keeps the line
    # visible even when fewer than 30 test rows match the filters
    ax.plot(X_test['SquareFeet'], X_test["y_pred"].rolling(window = 30, min_periods = 1).mean(), color = "darkorange", label = "Predicted Values", linewidth = 3)
    fig.suptitle("Actual & Predicted Prices by Square Feet Based On Housing Criteria")
    ax.set(xlabel = "Square Feet", ylabel = "Price ($)")
    ax.legend(loc = "upper right")
    fig.tight_layout()
    fig.savefig("housing.png")
    # output regression accuracy score (R-squared on the held-out test set)
    print()
    print("R-Squared (Model Accuracy):", round(r2_score(y_test, y_pred), 2))


if __name__ == '__main__':
    main()
