package packagename;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2;

public class SparkReadParquetAndRegTempTable {

    public static void main(String[] args) {
        SparkConf spconf = new SparkConf();
        spconf.set("spark.driver.maxResultSize", "3g");
        JavaSparkContext sc = null;
        try {
            sc = new JavaSparkContext(spconf);

            // A HiveContext (not a plain SQLContext) is required so that
            // HiveThriftServer2 can expose the registered temp table.
            HiveContext sqlContext = new HiveContext(sc.sc());

            // Read the parquet folder into a DataFrame and show its schema
            DataFrame df = sqlContext.read().parquet("/path/parquetFolderName.parquet");
            df.printSchema();

            // Register the DataFrame so it can be queried via beeline as a Spark Hive table
            df.registerTempTable("tempTable_spark");

            // Start a HiveServer2 (Thrift) instance backed by this HiveContext
            HiveThriftServer2.startWithContext(sqlContext);
        } catch (Exception e) {
            System.out.println("Error is " + e.toString());
        }
    }
}
How to submit the job
hadoop_classpath=$(hadoop classpath)
HBASE_CLASSPATH=$(hbase classpath)
sudo -u userName /spark/spark-1.5.2/bin/spark-submit \
  --name tempSparkTable \
  --class packagename.SparkReadParquetAndRegTempTable \
  --master local[4] \
  --num-executors 8 --executor-cores 8 --executor-memory 8G \
  --conf "spark.executor.extraClassPath=${HBASE_CLASSPATH}:${hadoop_classpath}" \
  --conf "spark.driver.extraClassPath=${HBASE_CLASSPATH}" \
  --jars /path/projectName-0.0.1-SNAPSHOT-jar-with-dependencies.jar \
  /path/projectName-0.0.1-SNAPSHOT.jar
Note that the HBase and Hadoop classpaths are combined into a single spark.executor.extraClassPath value; passing --conf twice for the same key keeps only the last value.
Make sure there is no Hive Thrift server already running on port 10000 on the machine where you run this program.
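To check whether something is already listening on that port, you can run (assuming netstat is available on the machine):

netstat -an | grep 10000

If this prints a line in LISTEN state, another HiveServer2 (or some other service) is already using the port.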
Connect via /opt/mapr/hive/hive-1.2/bin/beeline -u jdbc:hive2://serverName:10000 -n UserName
Running show tables; should list the registered temp table:
tempTable_spark
You can then run queries against this temp table from beeline, or programmatically as in the sketch below.
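As an alternative to beeline, a minimal Java JDBC sketch for querying the temp table might look like the following. This is an assumption-based example, not part of the original job: it needs the hive-jdbc driver on the classpath, and serverName, UserName and the printed column are placeholders matching the beeline example above.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class QueryTempTableViaJdbc {
    public static void main(String[] args) throws Exception {
        // HiveServer2 JDBC driver class (provided by the hive-jdbc artifact)
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // Same host/port/user as the beeline example above; empty password
        Connection con = DriverManager.getConnection(
                "jdbc:hive2://serverName:10000", "UserName", "");
        Statement stmt = con.createStatement();
        // Query the temp table registered by the Spark job
        ResultSet rs = stmt.executeQuery("select * from tempTable_spark limit 10");
        while (rs.next()) {
            // Print the first column of each row; actual columns depend on the parquet schema
            System.out.println(rs.getString(1));
        }
        rs.close();
        stmt.close();
        con.close();
    }
}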
Maven Dependencies:
<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>1.5.2</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.10</artifactId>
    <version>1.5.2</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>0.98.12-hadoop2</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-common</artifactId>
    <version>0.98.12-hadoop2</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-examples</artifactId>
    <version>0.98.12-hadoop2</version>
  </dependency>
  <dependency>
    <groupId>org.postgresql</groupId>
    <artifactId>postgresql</artifactId>
    <version>9.4.1208</version>
  </dependency>
  <dependency>
    <groupId>com.databricks</groupId>
    <!-- Use the Scala 2.10 build to match the other _2.10 Spark artifacts -->
    <artifactId>spark-csv_2.10</artifactId>
    <version>1.2.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive-thriftserver_2.10</artifactId>
    <version>1.5.2</version>
  </dependency>
</dependencies>
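The spark-submit command above passes a jar-with-dependencies artifact via --jars. If your build does not already produce one, a typical maven-assembly-plugin configuration is sketched below as an assumption (it was not part of the original post):

<build>
  <plugins>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-assembly-plugin</artifactId>
      <configuration>
        <!-- Produces projectName-0.0.1-SNAPSHOT-jar-with-dependencies.jar on mvn package -->
        <descriptorRefs>
          <descriptorRef>jar-with-dependencies</descriptorRef>
        </descriptorRefs>
      </configuration>
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>single</goal>
          </goals>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>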