Created
September 17, 2019 13:52
-
-
Save sllynn/8338fef98a5dcff3497c26b12f367c1c to your computer and use it in GitHub Desktop.
Melt a PySpark DataFrame (reshape it from wide to long format)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pyspark.sql.functions import array, col, explode, lit, struct | |
| from pyspark.sql import DataFrame | |
| from typing import Iterable | |
def melt(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format.

    Each input row produces one output row per entry in ``value_vars``.
    The output holds the ``id_vars`` columns unchanged, a ``var_name``
    column with the name of the melted column, and a ``value_name``
    column with that column's value.

    :param df: the wide DataFrame to reshape.
    :param id_vars: columns to keep as identifiers. Any iterable is
        accepted; a bare string is treated as a single column name.
    :param value_vars: columns to unpivot into rows. Any iterable is
        accepted; a bare string is treated as a single column name.
        The columns are packed into one array, so they presumably need
        a common (or coercible) type -- confirm against Spark's
        ``array`` semantics for your version.
    :param var_name: name of the output column holding the melted
        column names.
    :param value_name: name of the output column holding the values.
    :return: the long-format DataFrame.
    :raises ValueError: if ``value_vars`` is empty.
    """
    # Materialize once: the arguments are only promised to be iterables
    # (tuples, generators, ...), and ``id_vars`` is concatenated with a
    # list below, which would fail for anything that is not a list.
    id_cols = [id_vars] if isinstance(id_vars, str) else list(id_vars)
    value_cols = (
        [value_vars] if isinstance(value_vars, str) else list(value_vars))
    if not value_cols:
        # An empty array() cannot be indexed as a struct later on; fail
        # early with a clear message instead of an analysis error.
        raise ValueError("value_vars must contain at least one column name")

    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_cols))

    # Add to the DataFrame and explode: one output row per array element.
    # NOTE: "_vars_and_vals" is a temporary column name; an input column
    # with the same name would be replaced by withColumn.
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    # Keep the identifiers and unpack the two struct fields into columns.
    cols = id_cols + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)
# Example usage: build a Spark DataFrame from the pandas frame `pdf`,
# melt columns B and C while keeping A as the identifier, and print it.
sdf = spark.createDataFrame(pdf)
melted = melt(sdf, id_vars=["A"], value_vars=["B", "C"])
melted.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment