Wednesday, July 20, 2016

Partition a Parquet file using Apache Spark

The snippet below reads an existing Parquet file into a DataFrame using the Spark 1.x Java API and writes it back out partitioned by a column, so each distinct value of the partition column ends up in its own sub-directory.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

SparkConf spconf = new SparkConf();

// Allow larger results to be collected back to the driver if needed
spconf.set("spark.driver.maxResultSize", "3g");

JavaSparkContext sc = null;
try
{
    sc = new JavaSparkContext(spconf);

    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);

    // Use a HiveContext instead if the source data lives in the Hive metastore
    //HiveContext sqlContext = new org.apache.spark.sql.hive.HiveContext(sc.sc());

    // Read the source Parquet file into a DataFrame
    DataFrame df = sqlContext.read().parquet("/path/hive/warehouse/nextgen_rjr.db/source.parquet");

    df.printSchema();

    // Partition by multiple columns:
    //df.write().partitionBy("col_A","col_B").parquet("/targetPath/source_partitioned.parquet");

    // Partition by a single column; Spark writes one sub-directory per distinct value of col_A
    df.write().partitionBy("col_A").parquet("/targetPath/source_partitioned_by_single_col.parquet");
}
finally
{
    if (sc != null)
    {
        sc.close();
    }
}

System.out.println("Partitioned Parquet File created");
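
On disk the output is laid out as /targetPath/source_partitioned_by_single_col.parquet/col_A=&lt;value&gt;/part-*.parquet, one sub-directory per distinct value of col_A. As a quick check, the partitioned output can be read back and filtered on the partition column, which lets Spark prune partitions and scan only the matching sub-directories. The following is a minimal sketch using the same Spark 1.x Java API; the paths, the col_A column, and the "someValue" literal are just the illustrative names used above.

// Minimal sketch: read the partitioned output back and filter on the partition column.
// Paths, col_A, and "someValue" are hypothetical, matching the example above.
DataFrame partitioned = sqlContext.read()
        .parquet("/targetPath/source_partitioned_by_single_col.parquet");

// Filtering on the partition column prunes partitions: only the
// col_A=someValue sub-directory is read.
DataFrame subset = partitioned.filter(partitioned.col("col_A").equalTo("someValue"));

System.out.println("Rows for col_A = someValue: " + subset.count());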
