ZygD's solution doesn't work because the month is zero-indexed. Here is a simple fix for it.
from pyspark.sql.functions import lpad, concat, lit, regexp_replace, col
def convert_gregorian_datetime_to_date(df, col_name):
    """Replace a GregorianCalendar string column with the date it encodes.

    The column named ``col_name`` is expected to hold text containing the
    fields ``YEAR=``, ``MONTH=``, ``DAY_OF_MONTH=``, ``HOUR=``, ``MINUTE=``
    and ``SECOND=`` in that order, each followed by digits. The year, month
    and day are extracted, the month is shifted by one, and the column is
    overwritten with ``year-month-day`` cast to ``date``.

    Args:
        df: DataFrame that contains the column.
        col_name: Name of the string column to convert in place.

    Returns:
        A DataFrame in which ``col_name`` holds the converted date. No other
        column is added, overwritten, or dropped.
    """
    # Raw string: ``\d`` is not a valid Python string escape, so a plain
    # literal triggers an invalid-escape-sequence warning.
    gregorian_regex = r'(?:.*)YEAR=(\d+).+?MONTH=(\d+).+?DAY_OF_MONTH=(\d+).+?HOUR=(\d+).+?MINUTE=(\d+).+?SECOND=(\d+).+'
    source = col(col_name)

    # The pattern spans the whole value, so replacing it with a single group
    # reference leaves just that captured field.
    year = regexp_replace(source, gregorian_regex, '$1')
    # MONTH is zero-indexed in the source text: add one, then zero-pad to two
    # digits.
    month_number = regexp_replace(source, gregorian_regex, '$2').cast('integer') + 1
    month = lpad(month_number.cast('string'), 2, '0')
    day = regexp_replace(source, gregorian_regex, '$3')

    # Build the date from column expressions directly instead of staging the
    # parts in year_tmp/month_tmp/day_tmp columns, so that pre-existing
    # columns with those names are not clobbered and then dropped.
    iso_date = concat(year, lit('-'), month, lit('-'), day)
    return df.withColumn(col_name, iso_date.cast('date'))
# Demo: a one-row DataFrame whose single column holds the string form of a
# java.util.GregorianCalendar (note MONTH=7 in the text).
df = spark.createDataFrame(
    data=[
        ('java.util.GregorianCalendar[time=?,areFieldsSet=false,areAllFieldsSet=false,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Etc/UTC",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=?,YEAR=2017,MONTH=7,WEEK_OF_YEAR=?,WEEK_OF_MONTH=?,DAY_OF_MONTH=18,DAY_OF_YEAR=?,DAY_OF_WEEK=?,DAY_OF_WEEK_IN_MONTH=?,AM_PM=0,HOUR=9,HOUR_OF_DAY=9,MINUTE=1,SECOND=52,MILLISECOND=0,ZONE_OFFSET=?,DST_OFFSET=?]',),
    ],
    schema=['GregorianCalendar'],
)
# Convert the column and print the resulting DataFrame.
convert_gregorian_datetime_to_date(df, 'GregorianCalendar').show()
+-----------------+
|GregorianCalendar|
+-----------------+
|       2017-08-18|
+-----------------+