# Databricks notebook source
# Define date-stamped input and output paths
import datetime

currentdate = datetime.datetime.now().strftime("%Y_%m_%d")
input_file = "/mnt/cointainer/folder/data_json/sub_subfolder_" + currentdate + ".json"
output_path = "/mnt/cointainer/folder/data_parquet/sub_subfolder_" + currentdate

# COMMAND ----------

from pyspark.sql.functions import explode, col

# Read the multiline JSON file into a DataFrame
# (named raw_df rather than json to avoid shadowing the stdlib module name)
raw_df = spark.read.option("multiline", "true").json(input_file)

# Explode the retailer array, carrying the creation date along with each row
items = raw_df.select(
    explode("valid_retailers").alias("valid_names"),
    col("date_of_creation"),
)

# Flatten each retailer struct and explode its points of sale
childitems = items.select(
    col("valid_names.id").alias("id"),
    col("valid_names.name").alias("name"),
    col("valid_names.country").alias("country"),
    explode("valid_names.point_of_sales").alias("point"),
    col("date_of_creation"),
)

# display(childitems)

# COMMAND ----------

# Write the flattened result to Parquet, replacing any previous run for the same date
childitems.write.format("parquet").mode("overwrite").save(output_path)

# Equivalent shorthand:
# childitems.write.mode("overwrite").parquet(output_path)

# Read back to verify:
# parquet_df = spark.read.parquet(output_path)
# display(parquet_df)
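
# COMMAND ----------

# For reference, a minimal sketch of the input shape the parsing above assumes,
# inferred from the field names used in the selects. The literal values and the
# element type of point_of_sales are hypothetical; inspect the real file with
# raw_df.printSchema() before relying on this.
#
# {
#   "date_of_creation": "2024_05_01",
#   "valid_retailers": [
#     {
#       "id": 1,
#       "name": "Example Retailer",
#       "country": "NL",
#       "point_of_sales": ["POS-1", "POS-2"]
#     }
#   ]
# }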