How to configure a Spark Streaming job on YARN for a good resilience (http://mkuthan.github.io/blog/2016/09/30/spark-streaming-on-yarn/)
```shell
# yarn cluster mode: the driver runs inside YARN, so YARN can restart it
# spark.yarn.maxAppAttempts: default is 2
# spark.yarn.am.attemptFailuresValidityInterval: reset the attempt count every
#   hour (a streaming app can run for months)
# spark.yarn.max.executor.failures: 8 failures per executor; the default is
#   max(2 * num_executors, 3), so for 4 executors this allows 32 instead of 8
# spark.yarn.executor.failuresValidityInterval: same hourly reset as above, but
#   for executor failures
# spark.task.maxFailures: default is 4
# --queue: do not mess with the default queue
# spark.speculation: relaunches a slow task in parallel, so only enable it if
#   the job is idempotent
# extraJavaOptions: custom log4j config, e.g. to ship logs to Logstash
spark-submit --master yarn --deploy-mode cluster \
  --conf spark.yarn.maxAppAttempts=4 \
  --conf spark.yarn.am.attemptFailuresValidityInterval=1h \
  --conf spark.yarn.max.executor.failures={8 * num_executors} \
  --conf spark.yarn.executor.failuresValidityInterval=1h \
  --conf spark.task.maxFailures=8 \
  --queue realtime_queue \
  --conf spark.speculation=true \
  --conf spark.driver.extraJavaOptions=-Dlog4j.configuration=file:log4j.properties \
  --conf spark.executor.extraJavaOptions=-Dlog4j.configuration=file:log4j.properties \
  --files /path/to/log4j.properties,/path/to/metrics.properties
```
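The command ships a custom `log4j.properties` via `--files` and points both the driver and the executors at it. A minimal sketch of what such a file could look like (the appender name, target, and pattern here are illustrative, not from the original gist):

```properties
# Minimal log4j 1.x config for a Spark job (illustrative values).
log4j.rootLogger=INFO, console

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Silence Spark's own chatter so application logs stand out.
log4j.logger.org.apache.spark=WARN
```

To ship logs to Logstash instead of stderr, the console appender would be swapped for a socket or file appender that a log shipper picks up.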
```properties
*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink
*.sink.graphite.host=[hostname]
*.sink.graphite.port=[port]
*.sink.graphite.prefix=some_meaningful_name

driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource

# Graphite functions to parse the Spark metrics:
# for the driver:    aliasSub(stats.analytics.$job_name.*.prod.$dc.*.driver.jvm.heap.used, ".*(application_[0-9]+).*", "heap: \1")
# for the executors: aliasSub(groupByNode(stats.analytics.$job_name.*.prod.$dc.*.[0-9]*.jvm.heap.used, 6, "sumSeries"), "(.*)", "heap: \1")
```