I have a PySpark script that reads three JSON files, converts them to DataFrames, and registers the DataFrames as temporary tables on which SQL queries are run.
import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import *
from pyspark.sql import Row
import json
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.types import *
# Build (or reuse) the Spark session used for every SQL query below.
spark = (
    SparkSession.builder
    .appName("project")
    .getOrCreate()
)
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Load the three JSON inputs as DataFrames (paths are relative to the
# working directory / default file system).
reviewFile = sqlContext.read.json("review.json")
usersFile = sqlContext.read.json("user.json")
businessFile = sqlContext.read.json("business.json")

# Expose each DataFrame to Spark SQL under a table name.
reviewFile.createOrReplaceTempView("review")
usersFile.createOrReplaceTempView("user")
businessFile.createOrReplaceTempView("business")

# Attach the reviewing user's attributes to every review.
review_user = spark.sql("""
    select r.review_id, r.user_id, r.business_id, r.stars, r.date,
           u.name, u.review_count, u.yelping_since
    from review r
    join user u on r.user_id = u.user_id
""")
review_user.createOrReplaceTempView("review_user")

# Attach the reviewed business's attributes as well.  The business-side
# name / review_count / stars are aliased because review_user already has
# columns with those names; without the aliases the resulting view contains
# duplicate column names and any later reference to them is ambiguous.
review_user_business = spark.sql("""
    select r.review_id, r.user_id, r.business_id, r.stars, r.date,
           r.name, r.review_count, r.yelping_since,
           b.address, b.categories, b.city, b.latitude, b.longitude,
           b.name as business_name,
           b.neighborhood, b.postal_code,
           b.review_count as business_review_count,
           b.stars as business_stars,
           b.state
    from review_user r
    join business b on r.business_id = b.business_id
""")
review_user_business.createOrReplaceTempView("review_user_business")

# SQL `IN` only accepts a parenthesised value list, so "'Food' in
# r.categories" is not a membership test.  Use array_contains instead, and
# pick the predicate from the inferred schema: `categories` may have been
# read either as an array of strings or as one comma-separated string.
categories_type = dict(businessFile.dtypes)["categories"]
if categories_type.startswith("array"):
    food_predicate = "array_contains(r.categories, 'Food')"
else:
    # Split "A, B, C" on commas (plus any following spaces) before testing.
    food_predicate = "array_contains(split(r.categories, ', *'), 'Food')"

# Distinct category values that include 'Food'.
categories = spark.sql(
    "select distinct r.categories "
    "from review_user_business r "
    "where {0}".format(food_predicate)
)

# show() prints the table itself and returns None, so it must not be wrapped
# in print (which was also a Python 2-only print statement).
categories.show(50)
A description of the data is available at the following link: https://www.yelp.com/dataset/documentation/json
What I'm trying to do is get the rows that have "Food" as one of their categories. Can someone help me with this?