I've been trying to extract data from script tag in Kbb's HTML using Scrapy(xpath). But my main issue is with identifying the correct div and script tags. I'm new to using xpath and would appreciate any help!
<script type="text/javascript" src="http://s1.kbb.com/combine/IncentivesPilotJs/949332058"></script>
<input type="hidden" id="ResaleValueUrl" value="/ymmt/resalevalue/?vehicleid=392396" />
<input type="hidden" id="Intent" value="buy-used" />
<!--[if lt IE 9]>
<script>
window.FlashCanvasOptions = {
swfPath: "/js/canvas/FlashCanvas/UCMarketMeter/"
};
</script>
<script type="text/javascript" src="http://s1.kbb.com/combine/YmmtMarketMeterFlashCanvasJs/795892638"></script>
<![endif]-->
<script type="text/javascript" src="http://s1.kbb.com/combine/YMMTOverview/1527402533"></script>
<script type="text/javascript" src="http://s1.kbb.com/combine/YmmtPricingOverviewBuyUsedJs/-1416499456"></script>
<script language="javascript" type="text/javascript">
$(document).ready(function() {
KBB.Vehicle.Pages.PricingOverview.Buyers.setup({
//Workaround until we get cross domain working for Flash
imageDir: window.FlashCanvasOptions ? "/Content/images" : "http://file.kelleybluebookimages.com/kbb/images/marketmeter",
vehicleId: "392396",
zipCode: "78701",
mileage: "10000",
intent: "buy-used",
priceType: "retail",
condition: "good",
options: "392396|53635|78701|100|10|",
price: "17074",
manufacturer: "Nissan",
model: "Altima",
year: "2014",
style: "2.5 S Sedan 4D",
category: "",
hasCpo: true,
meetsCpoReq: true,
showOthersPaid: false,
data: {
"values": {
"cpo": {
"priceMin": 17335.0,
"price": 18275.0,
"priceMax": 19214.0
},
"fpp": {
"priceMin": 15286.0,
"price": 17074.0,
"priceMax": 18861.0
},
"privatepartyexcellent": {
"priceMin": 0.0,
"price": 16064.0,
"priceMax": 0.0
},
"privatepartyfair": {
"priceMin": 0.0,
"price": 14081.0,
"priceMax": 0.0
},
"privatepartygood": {
"priceMin": 0.0,
"price": 15454.0,
"priceMax": 0.0
},
"privatepartyverygood": {
"priceMin": 0.0,
"price": 15715.0,
"priceMax": 0.0
},
"retail": {
"priceMin": 0.0,
"price": 17875.0,
"priceMax": 0.0
}
},
"timAmount": 0.0,
"monthlyPayments": {
"cpo": {
"vehiclePrice": 18275.0,
"rate": 2.9,
"terms": 60.0,
"taxAndTitle": 6.5,
"downPay": 0.0,
"amount": 348.0
},
"fpp": {
"vehiclePrice": 17074.0,
"rate": 4.9,
"terms": 60.0,
"taxAndTitle": 6.5,
"downPay": 0.0,
"amount": 342.0
},
"privatepartyexcellent": {
"vehiclePrice": 16064.0,
"rate": 4.9,
"terms": 60.0,
"taxAndTitle": 6.5,
"downPay": 0.0,
"amount": 322.0
},
"privatepartyfair": {
"vehiclePrice": 14081.0,
"rate": 4.9,
"terms": 60.0,
"taxAndTitle": 6.5,
"downPay": 0.0,
"amount": 282.0
},
"privatepartygood": {
"vehiclePrice": 15454.0,
"rate": 4.9,
"terms": 60.0,
"taxAndTitle": 6.5,
"downPay": 0.0,
"amount": 309.0
},
"privatepartyverygood": {
"vehiclePrice": 15715.0,
"rate": 4.9,
"terms": 60.0,
"taxAndTitle": 6.5,
"downPay": 0.0,
"amount": 315.0
},
"retail": {
"vehiclePrice": 17875.0,
"rate": 4.9,
"terms": 60.0,
"taxAndTitle": 6.5,
"downPay": 0.0,
"amount": 358.0
}
},
"scale": {
"scaleLow": 14081.0,
"scaleHigh": 19214.0
},
"transactions": {
"below": 7,
"between": 17,
"above": 3
}
},
adPriceRanges: {"AdPriceRange":[{"PriceMin":0,"PriceMax":8499,"AdPRValue":1},{"PriceMin":8500,"PriceMax":18499,"AdPRValue":2},{"PriceMin":18500,"PriceMax":23499,"AdPRValue":3},{"PriceMin":23500,"PriceMax":28499,"AdPRValue":4},{"PriceMin":28500,"PriceMax":33499,"AdPRValue":5},{"PriceMin":33500,"PriceMax":38499,"AdPRValue":6},{"PriceMin":38500,"PriceMax":43499,"AdPRValue":7},{"PriceMin":43500,"PriceMax":48499,"AdPRValue":8},{"PriceMin":48500,"PriceMax":53499,"AdPRValue":9},{"PriceMin":53500,"PriceMax":63499,"AdPRValue":10},{"PriceMin":63500,"PriceMax":73499,"AdPRValue":11},{"PriceMin":73500,"PriceMax":1000000,"AdPRValue":12}]}});
});
$('.foot-note').hide();
$(window).on('popstate', function() {
KBB.Vehicle.Pages.PricingOverview.Buyers.stateChangeHandler();
});
</script>
Scrapy Code:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
import scrapy
from kbb.items import kbbItem
class kbbSpider(scrapy.Spider):
name = "kbb"
allowed_domains = ["kbb.com"]
start_urls = [
"http://www.kbb.com/nissan/altima/2014/25-s-sedan-4d/?vehicleid=392396&intent=buy-used&10000&good&pricetype=retail"
]
def parse(self, response):
sel=Selector(response)
#sites=sel.xpath('//div')
items=[]
#for site in sites:
item=kbbItem
item['priceMin']=site.xpath('//div/script').extract[35][915:922]
return items
I finally want to populate priceMin, price, priceMax from fpp and price from retail field into my items. Currently I'm using indices to get those values but was wondering if there is an easier way.