# Import numpy since it was missing earlier
import numpy as np
# Log-wage regression: clean the wage variable, build the regressors, and fit
# an OLS model with heteroskedasticity-robust (HC1) standard errors.
# Relies on `df`, `pd` and `smf` already being defined earlier in the session.

# Convert 'hourpay' to numeric; values that cannot be parsed become NaN.
df['hourpay'] = pd.to_numeric(df['hourpay'], errors='coerce')

# Step 1 (revised): keep only strictly positive wages. NaN > 0 evaluates to
# False, so missing (and unparseable) wages are dropped here as well.
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignments below do not trigger SettingWithCopyWarning.
df = df[df['hourpay'] > 0].copy()

# Create log(wage); safe because every remaining 'hourpay' is > 0.
df['log_wage'] = np.log(df['hourpay'])

# Step 2: Motherhood dummy: 1 if 'fdpch19' > 0, else 0.
# NOTE(review): 'fdpch19' is presumably the number of dependent children
# under 19 -- confirm against the codebook. Missing values compare False and
# are therefore coded 0, not NaN.
df['motherhood'] = (df['fdpch19'] > 0).astype(int)

# Step 3: Convert categorical variables so they enter the model as dummies.
df['education'] = df['degcls7'].astype('category')
df['occupation'] = df['occup_group'].astype('category')
df['worktype'] = df['ftpt'].astype('category')

# Step 4: Experience approximation (age used directly as the proxy).
df['experience'] = df['age']

# Step 5: Regression formula; C(...) marks a term as categorical.
formula = (
    'log_wage ~ motherhood + C(education) + experience'
    ' + C(occupation) + C(worktype)'
)

# Step 6: Run OLS regression with robust (HC1) standard errors.
model = smf.ols(formula, data=df).fit(cov_type='HC1')

# Display regression results (rendered when this is the last expression of a
# notebook cell; in a plain script wrap it in print()).
model.summary()