Skip to content

Instantly share code, notes, and snippets.

View Hugheym's full-sized avatar
🎷
Coffee

Hugheym

🎷
Coffee
View GitHub Profile
%python
import mapboxgl
import matplotlib.pyplot as plt
import matplotlib
from pylab import *
# NOTE(review): presumably the height cap and height multiplier for the rendered map layer -
# their use is not visible in this snippet; confirm.
max_height = 100
height_scale = 1
# Values of df[paramToVisualize] at 25 evenly spaced quantile levels from 0 to 1 inclusive.
quantiles = df[paramToVisualize].quantile(np.linspace(0,1, 25)).values
# Seed the cutoff list with the value at quantile level 0.
cutoffs = [quantiles[0]]
@Hugheym
Hugheym / spark_filtering_nan_and_infinity.scala
Created April 28, 2020 02:35
Filter NaN and Infinity values out of a DataFrame.
// Keep only the rows whose "colName" value is none of NaN, +Infinity or -Infinity.
// NOTE(review): rows where the column is null presumably get dropped as well,
// since the predicate would evaluate to null - confirm against Spark semantics.
val nonFiniteDoubles = Seq(Double.NaN, Double.PositiveInfinity, Double.NegativeInfinity)
df.filter(!$"colName".isin(nonFiniteDoubles: _*))
@Hugheym
Hugheym / dataframe_to_geojson.py
Last active April 27, 2020 12:13
Converting pandas DataFrame with lat, lon coordinate lists to GeoJSON Features.
%python
import numpy as np
paramToVisualize = "numPickups"
def to_hex_geojson(dataLat, dataLon):
    """Build a GeoJSON Polygon geometry from parallel latitude/longitude sequences.

    Args:
        dataLat: iterable of vertex latitudes.
        dataLon: iterable of vertex longitudes, paired with ``dataLat`` by position.

    Returns:
        A dict with ``"type": "Polygon"`` and ``"coordinates"`` holding a single
        ring of ``(lon, lat)`` tuples whose last point repeats the first.

    Raises:
        IndexError: if the inputs yield no vertices (there is no first point
            to close the ring with).
    """
    # Pair the values as (longitude, latitude) - longitude comes first.
    coords = list(zip(dataLon, dataLat))
    # Because the geojson polygon requires the last and first element to match.
    coords.append(coords[0])
    return {"type": "Polygon",
            "coordinates": [coords]}
# Element-wise wrapper around to_hex_geojson; otypes=[object] makes the output an
# object array holding the value returned for each element.
hexify = np.vectorize(to_hex_geojson, otypes=[object])
%python
dbutils.library.installPyPI("mapboxgl")
import numpy as np
import pandas as pd
import mapboxgl
from geojson.feature import Feature, FeatureCollection
# Read the "h3PickupStatsWithHex" table through Spark and convert it to a pandas DataFrame.
df = spark.table("h3PickupStatsWithHex").toPandas()
# Data gets pulled into driver node memory when we convert to pandas.
# Look at the first 4 rows.
df.head(4)
// Extends h3PickupStats with two columns derived from the "h3_pickup" column and
// registers the result as the temp view "h3PickupStatsWithHex".
// NOTE(review): h3ToGeoLatBoundary / h3ToGeoLonBoundary are defined elsewhere; presumably they
// return the latitudes and longitudes of the cell's boundary vertices - confirm.
val h3PickupStatsWithHex = {
  val pickupCell = col("h3_pickup")
  h3PickupStats
    .withColumn("pickupH3Lats", h3ToGeoLatBoundary(pickupCell))
    .withColumn("pickupH3Lons", h3ToGeoLonBoundary(pickupCell))
}
h3PickupStatsWithHex.createOrReplaceTempView("h3PickupStatsWithHex")
import org.apache.spark.sql.functions._
// Per-cell pickup statistics for the "ny_taxi_sample" table: each row is tagged with the
// value geoToH3 returns for its pickup latitude/longitude, then rows are grouped by that
// value to get the row count, the summed passenger count and the average tip amount.
val h3PickupStats = {
  // Third argument handed to geoToH3 (named `resolution` in its definition).
  val resolution = lit(11)
  // create h3 index
  val pickupsWithCell = spark.table("ny_taxi_sample")
    .withColumn("h3_pickup", geoToH3(col("pickup_latitude"), col("pickup_longitude"), resolution))
  pickupsWithCell
    .groupBy("h3_pickup")
    .agg(
      count("*").alias("numPickups"),
      sum("passenger_count").alias("totalPassangerCount"),
      avg("tip_amount").alias("avg_tip")
    )
}
import com.uber.h3core.H3Core
import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
/**
 * Holder for a single shared [[H3Core]] obtained from `H3Core.newInstance()`.
 *
 * NOTE(review): presumably marked `Serializable` so that UDF closures can
 * reference it - confirm.
 */
object H3 extends Serializable {
  val instance: H3Core = H3Core.newInstance()
}
// Return the H3
val geoToH3 = udf{ (latitude: Double, longitude: Double, resolution: Int) =>
%python
# Cast every column of results_df to float, except columns whose name ends with
# "datetime" and the three columns listed in the tuple below.
for colName in results_df.columns:
    if not colName.endswith("datetime") and colName not in ("payment_type", "vendor_id", "store_and_fwd_flag"):
        results_df[colName] = results_df[colName].astype(float)
# Build a Spark DataFrame from the pandas frame and save it as the table
# "ny_taxi_sample" in overwrite mode.
sparkDf = spark.createDataFrame(results_df.infer_objects())
sparkDf.write.mode("overwrite").saveAsTable("ny_taxi_sample")
%python
# Use dbutils to install sodapy, and mapboxgl from the python package index (we can also do this in the cluster settings page)
dbutils.library.installPyPI("sodapy") #sodapy is a client for Socrata Open Data API.
dbutils.library.installPyPI("mapboxgl") #mapboxgl is used for rendering interactive maps.
import pandas as pd
from sodapy import Socrata
# Unauthenticated client only works with public data sets.
client = Socrata("data.cityofnewyork.us", None)
SELECT objectid, COALESCE(AVG(angle), 0) tortuosity FROM
(
SELECT objectid,
ST_ANGLE(ST_MAKELINE(lag(point,2) over w, lag(point,1) over w), ST_MAKELINE((lag(point, 1) over w), point)) angle
FROM
( SELECT objectid, (points).geom point, (points).path[2] path FROM
(SELECT
objectid, ST_DumpPoints(shape) points FROM (SELECT * FROM map.road_link) t1
) t2
) t3