Skip to content

Instantly share code, notes, and snippets.

@sllynn
Created September 17, 2019 13:52
Show Gist options
  • Select an option

  • Save sllynn/8338fef98a5dcff3497c26b12f367c1c to your computer and use it in GitHub Desktop.

Select an option

Save sllynn/8338fef98a5dcff3497c26b12f367c1c to your computer and use it in GitHub Desktop.
Melt a PySpark DataFrame
from pyspark.sql.functions import array, col, explode, lit, struct
from pyspark.sql import DataFrame
from typing import Iterable
def melt(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format.

    For every input row, one output row is produced per column named in
    ``value_vars``. Each output row keeps the ``id_vars`` columns and adds
    two columns: one holding the name of the source column and one holding
    that column's value.

    Args:
        df: The wide-format DataFrame to reshape.
        id_vars: Names of the columns to carry through unchanged. Any
            iterable of names is accepted (list, tuple, generator, ...).
        value_vars: Names of the columns to unpivot into rows.
        var_name: Name of the output column holding the source column name.
        value_name: Name of the output column holding the source value.

    Returns:
        A DataFrame with the ``id_vars`` columns followed by ``var_name``
        and ``value_name``.

    Note:
        The value columns are packed into a single array column, so they
        presumably need mutually compatible types -- confirm against the
        Spark ``array`` documentation. The intermediate column is named
        ``_vars_and_vals``; an input column with that same name would be
        overwritten by ``withColumn`` before the final select.
    """
    # Materialise up front: the signature promises any Iterable, but list
    # concatenation (the previous approach) only worked for an actual list.
    id_cols = list(id_vars)

    # Build array<struct<var_name: str, value_name: ...>> with one struct
    # per value column.
    vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))

    # Attach the array to the DataFrame and explode it: one row per struct.
    exploded = df.withColumn("_vars_and_vals", explode(vars_and_vals))

    # Pull the two struct fields back out as top-level columns.
    melted_cols = [
        col("_vars_and_vals")[field].alias(field)
        for field in (var_name, value_name)]

    return exploded.select(*id_cols, *melted_cols)
# Usage example. `spark` and `pdf` are not defined in this snippet and must
# already exist in the calling environment (presumably a SparkSession and a
# pandas DataFrame with columns A, B and C -- confirm before running).
sdf = spark.createDataFrame(pdf)
melted = melt(sdf, id_vars=["A"], value_vars=["B", "C"])
melted.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment