I wish to scrape data from [a link]http://cbfcindia.gov.in/html/SearchDetails.aspx?mid=1&Loc=Backlog! , However the MID parameter is incremental in URL to give 2nd, 3rd URL ..... till 1000 URLs, so how shall I deal with this(I am new to PYTHON AND SCRAPY, so dont mind me asking this)?
Please check the XPATH i have used to extract the information, it is fetching no output, is there elementary error in the spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from movie.items import MovieItem
class MySpider(BaseSpider):
name = 'movie'
allowed_domains= ["http://cbfcindia.gov.in/"]
start_urls = ["http://cbfcindia.gov.in/html/SearchDetails.aspx?mid=1&Loc=Backlog"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//body") #Check
print titles
items = []
for titles in titles:
print "in FOR loop"
item = MovieItem()
item ["movie_name"]=hxs.xpath('//TABLE[@id="Table2"]/TR[2]/TD[2]/text()').extract()
print "XXXXXXXXXXXXXXXXXXXXXXXXX movie name:", item["movie_name"]
item ["movie_language"] = hxs.xpath('//*[@id="lblLanguage"]/text()').extract()
item ["movie_category"] = hxs.xpath('//*[@id="lblRegion"]/text()').extract()
item ["regional_office"] = hxs.xpath('//*[@id="lblCertNo"]/text()').extract()
item ["certificate_no"] = hxs.xpath('//*[@id="Label1"]/text()').extract()
item ["certificate_date"] = hxs.xpath('//*@id="lblCertificateLength"]/text()').extract()
item ["length"] = hxs.xpath('//*[@id="lblProducer"]/text()').extract()
item ["producer_name"] = hxs.xpath('//*[@id="lblProducer"]/text()').extract()
items.append(item)
print "this is ITEMS"
return items
Below is the log :
log>
{'certificate_date': [],
'certificate_no': [],
'length': [],
'movie_category': [],
'movie_language': [],
'movie_name': [],
'producer_name': [],
'regional_office': []}
2014-06-11 23:20:44+0530 [movie] INFO: Closing spider (finished)
214-06-11 23:20:44+0530 [movie] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 256,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 6638,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2014, 6, 11, 17, 50, 44, 54000),
'item_scraped_count': 1,
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2014, 6, 11, 17, 50, 43, 681000)}