Skip to content

Commit 3b6e160

Browse files
committed
Use SparkSession.builder instead of SparkContext/SQLContext for the basic example; replace deprecated registerFunction with spark.udf.register
1 parent 6aad7d0 commit 3b6e160

1 file changed

Lines changed: 9 additions & 11 deletions

File tree

spark/container.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,34 @@
1-
from pyspark import SparkContext
2-
from pyspark.sql import SQLContext, Row
1+
from pyspark.sql import SparkSession
32

4-
sc = SparkContext('local', 'barcos')
5-
sq = SQLContext(sc)
3+
spark = SparkSession.builder.master("local").appName("container").getOrCreate()
64

7-
df = sq.read.load('data/containers_tiny.parquet')
5+
df = spark.read.load('data/containers_tiny.parquet')
86
df.printSchema()
97

108
# Using API
119
df.select("ship_imo", "ship_name", "country").filter(df['country'] == 'DK').show()
1210

1311
# Register table alias to allow SQL use
1412
df.createOrReplaceTempView("container")
15-
sq.sql("SELECT ship_imo, ship_name FROM container WHERE country = 'DK'").show()
13+
spark.sql("SELECT ship_imo, ship_name FROM container WHERE country = 'DK'").show()
1614

1715
# ship_imo, num of containers, total ship weight
18-
total_weight_rdd = sq.sql("SELECT ship_imo, count(container_id) number, sum(net_weight) total_weight FROM container GROUP BY ship_imo")
16+
total_weight_rdd = spark.sql("SELECT ship_imo, count(container_id) number, sum(net_weight) total_weight FROM container GROUP BY ship_imo")
1917
total_weight_rdd.printSchema()
2018
total_weight_rdd.show()
2119
# print total_weight_rdd.map(lambda r: r['number']).collect()
2220

2321
# UDFs
24-
sq.registerFunction('en_toneladas', lambda c: float(c) / 1000.0)
25-
sq.sql("SELECT en_toneladas(net_weight) toneladas, net_weight FROM container WHERE container_id = 'FMBV1684747'").show()
22+
spark.udf.register('en_toneladas', lambda c: float(c) / 1000.0)
23+
spark.sql("SELECT en_toneladas(net_weight) toneladas, net_weight FROM container WHERE container_id = 'FMBV1684747'").show()
2624

2725
# JOINs: Extract description of container codes
28-
codes = sq.read.json('data/iso-container-codes.json')
26+
codes = spark.read.json('data/iso-container-codes.json')
2927
codes.createOrReplaceTempView('codes')
3028
codes.printSchema()
3129
codes.show()
3230

33-
w_desc = sq.sql("SELECT c.container_id, s.code, s.description FROM container c JOIN codes s on c.container_type = s.code")
31+
w_desc = spark.sql("SELECT c.container_id, s.code, s.description FROM container c JOIN codes s on c.container_type = s.code")
3432
w_desc.show()
3533
print(w_desc.groupBy("code").count().take(3))
3634

0 commit comments

Comments (0)