Cheat sheet for Spark DataFrames (using Python)

aggregate functions

approxCountDistinct, avg, count, countDistinct, first, last, max, mean, min, sum, sumDistinct
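
These are applied through agg(), either per group or across the whole DataFrame. A minimal sketch, assuming a SQLContext named sqlContext as in the snippets below:

from pyspark.sql.functions import count, avg, countDistinct
adf = sqlContext.createDataFrame([(1, 4), (1, 5), (2, 6)], ['A', 'B'])
adf.groupBy('A').agg(
    count('B').alias('n'),
    avg('B').alias('avg_B'),
    countDistinct('B').alias('distinct_B')
).show()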

window functions

cumeDist, denseRank, lag, lead, ntile, percentRank, rank, rowNumber
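
These require a window specification and over(). A minimal sketch using the Spark 1.x names listed above (later versions rename them to snake_case, e.g. row_number):

from pyspark.sql.window import Window
from pyspark.sql.functions import rowNumber, lag
wdf = sqlContext.createDataFrame([(1, 4), (1, 5), (2, 6)], ['A', 'B'])
w = Window.partitionBy('A').orderBy('B')
wdf.select('A', 'B',
    rowNumber().over(w).alias('row_n'),
    lag('B', 1).over(w).alias('previous_B')
).show()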

string functions

ascii, base64, concat, concat_ws, decode, encode, format_number, format_string, get_json_object, initcap, instr, length, levenshtein, locate, lower, lpad, ltrim, printf, regexp_extract, regexp_replace, repeat, reverse, rpad, rtrim, soundex, space, split, substring, substring_index, translate, trim, unbase64, upper
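
Most of these take a column (or column name) and return a new column. A minimal sketch:

from pyspark.sql.functions import lower, substring, regexp_replace, concat_ws
sdf = sqlContext.createDataFrame([('Spark SQL',)], ['s'])
sdf.select(
    lower(sdf.s).alias('lowered'),
    substring(sdf.s, 1, 5).alias('first_five'),
    regexp_replace(sdf.s, ' ', '_').alias('underscored'),
    concat_ws('-', sdf.s, sdf.s).alias('joined')
).show()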

misc functions

array, bitwiseNOT, callUDF, coalesce, crc32, greatest, if, inputFileName, isNaN, isnotnull, isnull, least, lit, md5, monotonicallyIncreasingId, nanvl, negate, not, rand, randn, sha, sha1, sparkPartitionId, struct, when
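
Of these, when/otherwise builds conditional columns, coalesce returns the first non-null argument, and lit wraps a Python literal. A minimal sketch:

from pyspark.sql.functions import coalesce, lit, when, isnull
mdf = sqlContext.createDataFrame([(1, 4), (2, None)], ['A', 'B'])
mdf.select(
    coalesce(mdf.B, lit(0)).alias('B_or_zero'),
    when(mdf.A > 1, 'big').otherwise('small').alias('size'),
    isnull(mdf.B).alias('B_is_null')
).show()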

datetime

current_date, current_timestamp, trunc, date_format, datediff, date_add, date_sub, add_months, last_day, next_day, months_between, year, month, dayofmonth, hour, minute, second, unix_timestamp, from_unixtime, to_date, quarter, day, dayofyear, weekofyear, from_utc_timestamp, to_utc_timestamp
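
A minimal sketch of parsing a date string and doing date arithmetic:

from pyspark.sql.functions import to_date, date_add, add_months, year, weekofyear
ddf = sqlContext.createDataFrame([('2015-03-31',)], ['d'])
d = to_date(ddf.d)
ddf.select(
    d.alias('as_date'),
    date_add(d, 7).alias('plus_week'),
    add_months(d, 1).alias('next_month'),
    year(d).alias('y'),
    weekofyear(d).alias('week')
).show()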

# misc import statements
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pprint import pprint as pp
# creating dataframes
df = sqlContext.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"]) # from manual data
# reading CSV via the spark-csv package (built into Spark 2.x as format 'csv')
df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(delimiter=';', header='true', inferschema='true', mode='FAILFAST') \
    .load('csv_file_name_or_*_reference')
# adding columns while keeping existing ones (withColumn returns a new DataFrame)
df = df.withColumn('zero', lit(0))
df = df.withColumn('A_times_two', df.A * 2)
# selecting columns, and creating new ones
df.select(
    'A',
    'B',
    col('A').alias('new_name_for_A'),  # col() refers to a column without repeating the DataFrame name
    (col('B') > 0).alias('is_B_greater_than_zero'),
    unix_timestamp('A', 'dd.MM.yyyy HH:mm:ss').alias('A_in_unix_time')  # parse a text column to unix time
)
# filtering
df.filter('A_in_unix_time > 946684800')
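# filter() also accepts Column expressions; combine clauses with & and | and parenthesize each one (sketch)
df.filter((col('A') > 0) & (col('B') != 0))
df.filter(df.A.isin(1, 2, 3))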
# pivoting
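# sketch: assume 'unpivoted' has key columns A and B, pivot keys in C, and values in D (pivot requires Spark 1.6+)
unpivoted = sqlContext.createDataFrame([(1, 2, 'x', 10), (1, 2, 'y', 20)], ['A', 'B', 'C', 'D'])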
unpivoted.groupBy('A','B').pivot('C').agg(first('D')).orderBy(['A','B']) # first could be any aggregate function
# inspecting dataframes
display(df) # rich table output (Databricks notebooks only)
df.show() # text table
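df.printSchema() # column names, types and nullability
df.describe().show() # count/mean/stddev/min/max for numeric columns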
# Create a simple DataFrame.
data = [
    ("2015-01-01 23:59:59", "2015-01-02 00:01:02", 1),
    ("2015-01-02 23:00:00", "2015-01-02 23:59:59", 2),
    ("2015-01-02 22:59:58", "2015-01-02 23:59:59", 3)]
df = sqlContext.createDataFrame(data, ["start_time", "end_time", "id"])
df = df.select(
    df.start_time.cast("timestamp").alias("start_time"),
    df.end_time.cast("timestamp").alias("end_time"),
    df.id)
# Get all records that have a start_time and end_time in the
# same day, and where the difference between end_time and
# start_time is less than or equal to 1 hour.
condition = \
    (to_date(df.start_time) == to_date(df.end_time)) & \
    (df.start_time + expr("INTERVAL 1 HOUR") >= df.end_time)
df.filter(condition).show()
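# only id 2 passes: id 1 spans two days, and id 3's duration is one second over an hour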