# Reports

# Import numpy since it was missing earlier

import numpy as np

# Re-run the log transformation and regression

# Coerce 'hourpay' to numeric; any value that cannot be parsed becomes NaN.
df['hourpay'] = pd.to_numeric(df['hourpay'], errors='coerce')

# Step 1 (revised): keep only strictly positive wages.
# NaN > 0 evaluates to False, so missing/unparseable values are dropped
# together with zeros and negatives, which keeps the log below well-defined.
# .copy() makes the filtered frame an independent object, so the column
# assignments that follow write to it directly instead of to a slice of the
# original frame (this is what triggers pandas' SettingWithCopyWarning).
df = df.loc[df['hourpay'] > 0].copy()

# Create log(wage): natural log of the cleaned hourly pay.
df['log_wage'] = np.log(df['hourpay'])

# Step 2: Motherhood dummy: 1 where 'fdpch19' is greater than 0, 0 otherwise.
# The comparison is vectorised rather than a per-row lambda; NaN compares
# False, so missing values map to 0 exactly as they did before.
# NOTE(review): presumably 'fdpch19' counts dependent children under 19 and
# uses negative codes for "not applicable" -- confirm against the codebook,
# since such rows are treated as "no children" here.
# NOTE(review): nothing in this section restricts the sample by sex; verify
# that an earlier step does so if the dummy is meant to capture motherhood.
df['motherhood'] = (df['fdpch19'] > 0).astype(int)

# Step 3: Store the categorical regressors with pandas' category dtype.
df['education'] = df['degcls7'].astype('category')
df['occupation'] = df['occup_group'].astype('category')
df['worktype'] = df['ftpt'].astype('category')

# Step 4: Experience approximation: age is used directly as the proxy.
df['experience'] = df['age']

# Step 5: Regression formula.
# log_wage is regressed on the motherhood dummy and experience, with
# education, occupation and work type wrapped in C(...) so the formula
# interface treats them as categorical terms.
formula = 'log_wage ~ motherhood + C(education) + experience + C(occupation) + C(worktype)'

# Step 6: Run OLS regression with robust (HC1) standard errors.
# NOTE(review): `smf` is presumably statsmodels.formula.api, imported
# earlier -- confirm.
model = smf.ols(formula, data=df).fit(cov_type='HC1')

# Display regression results.
# A bare `model.summary()` expression is only rendered when it is the last
# expression of a notebook cell; printing makes the table appear when this
# runs as a script or mid-cell as well.
print(model.summary())

79511832