## Reading in data
```python
from pyspark.sql import functions as F

# Read the semicolon-delimited wine-quality CSV, using the header row
# and letting Spark infer the column types.
path = '/databricks-datasets/wine-quality/winequality-white.csv'
wine_df = (spark.read
           .option('header', 'true')
           .option('inferSchema', 'true')
           .option('sep', ';')
           .csv(path))

# Replace spaces in column names with underscores so they are valid in
# downstream SQL and Delta operations.
wine_df_clean = wine_df.select([F.col(col).alias(col.replace(' ', '_'))
                                for col in wine_df.columns])
display(wine_df_clean)
```
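The split in the next section references a `wine_df_delta` DataFrame, which suggests the cleaned data is written to a Delta table and read back in between these steps, in keeping with the Lakehouse workflow. A minimal sketch of that round trip, assuming a hypothetical storage path (`/tmp/delta/wine_quality` is illustrative, not the notebook's actual location):

```python
# Hypothetical Delta location; substitute your own storage path.
delta_path = '/tmp/delta/wine_quality'

# Persist the cleaned DataFrame as a Delta table, then read it back so
# training runs against the stored, versioned copy rather than the raw CSV.
wine_df_clean.write.format('delta').mode('overwrite').save(delta_path)
wine_df_delta = spark.read.format('delta').load(delta_path)
```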
## Split the data into training and test sets (0.75/0.25)
```python
from sklearn.model_selection import train_test_split

# Fix the random seed so the split is reproducible.
seed = 1111

# train_test_split works on pandas, so convert the Spark DataFrame first.
# The target column "quality" is an integer score ranging from 3 to 9.
train, test = train_test_split(wine_df_delta.toPandas(),
                               train_size=0.75, random_state=seed)
X_train = train.drop(['quality'], axis=1)
X_test = test.drop(['quality'], axis=1)
y_train = train[['quality']]
y_test = test[['quality']]
```
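As a quick check that the split yields usable inputs, the features and labels can be fed to any scikit-learn regressor; a minimal sketch, using `RandomForestRegressor` as an illustrative choice rather than the model the original notebook prescribes:

```python
from sklearn.ensemble import RandomForestRegressor

# Illustrative model; any estimator with the fit/score interface would do.
model = RandomForestRegressor(n_estimators=100, random_state=seed)
model.fit(X_train, y_train.values.ravel())  # ravel() flattens the 1-column target

# R^2 on the held-out test set.
print(model.score(X_test, y_test))
```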
## Reproducible Anything: Machine Learning Meets Lakehouse
You can run this notebook using the following cluster config:
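A sketch of the shape such a config takes when expressed as a Databricks Clusters API payload; every value below is a placeholder, not the configuration the original notebook used:

```python
# All values are illustrative placeholders; pick a Databricks ML runtime and
# node type available in your workspace.
cluster_config = {
    "cluster_name": "reproducible-ml",
    "spark_version": "10.4.x-cpu-ml-scala2.12",  # ML runtime ships with sklearn/MLflow
    "node_type_id": "i3.xlarge",
    "num_workers": 2,
}
```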