Wednesday, July 20, 2016

How to read a Parquet file into a DataFrame and register it as a Hive temp table


package packagename;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2;

public class SparkReadParquetAndRegTempTable {

    public static void main(String[] args) {

        SparkConf spconf = new SparkConf();
        spconf.set("spark.driver.maxResultSize", "3g");

        JavaSparkContext sc = null;
        try {
            sc = new JavaSparkContext(spconf);

            // Use a HiveContext (not a plain SQLContext) so the temp table
            // can be exposed through the Hive Thrift server below
            HiveContext sqlContext = new HiveContext(sc.sc());

            // Read the Parquet folder into a DataFrame
            DataFrame df = sqlContext.read().parquet("/path/parquetFolderName.parquet");
            df.printSchema();

            // Register the DataFrame as a temp table so it can be queried
            // via beeline as a Spark Hive table
            df.registerTempTable("tempTable_spark");

            // Start a Hive Thrift server (default port 10000) backed by this HiveContext
            HiveThriftServer2.startWithContext(sqlContext);
        }
        catch (Exception e) {
            System.out.println("Error is " + e.toString());
        }
    }
}


How to submit the job

hadoop_classpath=$(hadoop classpath)
HBASE_CLASSPATH=$(hbase classpath)

sudo -u userName /spark/spark-1.5.2/bin/spark-submit \
  --name tempSparkTable \
  --class packagename.SparkReadParquetAndRegTempTable \
  --master local[4] \
  --num-executors 8 \
  --executor-cores 8 \
  --executor-memory 8G \
  --conf "spark.executor.extraClassPath=${HBASE_CLASSPATH}:${hadoop_classpath}" \
  --conf "spark.driver.extraClassPath=${HBASE_CLASSPATH}" \
  --jars /path/projectName-0.0.1-SNAPSHOT-jar-with-dependencies.jar \
  /path/projectName-0.0.1-SNAPSHOT.jar

If --conf "spark.executor.extraClassPath=..." is passed twice, the second value overrides the first, so the HBase and Hadoop classpaths have to be combined into a single entry. Also note that --num-executors, --executor-cores and --executor-memory only take effect on a cluster manager such as YARN; with --master local[4] the whole job runs in a single driver JVM.

Make sure there is no other Hive Thrift server already running on port 10000 on the machine where you run this program.
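If port 10000 cannot be freed, another option is to start the embedded Thrift server on a different port. A minimal sketch of what could be added before the startWithContext call above, assuming the Thrift server picks up hive.server2.thrift.port from the HiveContext configuration (10015 is just an example port, not from the original program):

// Assumption: the embedded Thrift server reads hive.server2.thrift.port
// from the HiveContext configuration; set it before starting the server
sqlContext.setConf("hive.server2.thrift.port", "10015");
HiveThriftServer2.startWithContext(sqlContext);
// beeline would then connect to jdbc:hive2://serverName:10015 instead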


Connect via:

/opt/mapr/hive/hive-1.2/bin/beeline -u jdbc:hive2://serverName:10000 -n UserName

show tables;

should list the registered temp table in its output:

tempTable_spark

You can then run queries against this temp table.
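You can also sanity-check the temp table from inside the Spark application itself, before switching to beeline. A minimal sketch of lines that could go in the try block above, right after registerTempTable (the LIMIT 10 query is just an illustration):

// Query the temp table through the same HiveContext and print a few rows
DataFrame sample = sqlContext.sql("SELECT * FROM tempTable_spark LIMIT 10");
sample.show();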


Maven Dependencies:

<dependencies>

  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>1.5.2</version>
  </dependency>

  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.10</artifactId>
    <version>1.5.2</version>
  </dependency>

  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>0.98.12-hadoop2</version>
  </dependency>

  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-common</artifactId>
    <version>0.98.12-hadoop2</version>
  </dependency>

  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-examples</artifactId>
    <version>0.98.12-hadoop2</version>
  </dependency>

  <dependency>
    <groupId>org.postgresql</groupId>
    <artifactId>postgresql</artifactId>
    <version>9.4.1208</version>
  </dependency>

  <!-- keep the Scala version consistent with the _2.10 Spark artifacts above -->
  <dependency>
    <groupId>com.databricks</groupId>
    <artifactId>spark-csv_2.10</artifactId>
    <version>1.2.0</version>
  </dependency>

  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive-thriftserver_2.10</artifactId>
    <version>1.5.2</version>
  </dependency>

</dependencies>
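The spark-submit command above passes a projectName-0.0.1-SNAPSHOT-jar-with-dependencies.jar under --jars, which is the naming convention of the maven-assembly-plugin. A minimal build-section sketch that would produce such a jar (the plugin version and configuration here are illustrative, not taken from the original project):

<build>
  <plugins>
    <!-- builds the *-jar-with-dependencies.jar referenced in the spark-submit command -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-assembly-plugin</artifactId>
      <version>2.6</version>
      <configuration>
        <descriptorRefs>
          <descriptorRef>jar-with-dependencies</descriptorRef>
        </descriptorRefs>
      </configuration>
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>single</goal>
          </goals>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>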




