I am using Scrapy and a Regex to parse some none standard web source code. I then wish to parse the first element of the dictionary returned:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.cmdline import execute
from scrapy.utils.markup import remove_tags
import time
import re
import json
import requests
class ExampleSpider(CrawlSpider):
name = "goal2"
allowed_domains = ["whoscored.com"]
start_urls = ["http://www.whoscored.com"]
download_delay = 5
rules = [Rule(SgmlLinkExtractor(allow=('\Teams'),deny=(),), follow=False, callback='parse_item')]
def parse_item(self, response):
sel = Selector(response)
titles = sel.xpath("normalize-space(//title)")
print '-' * 170
myheader = titles.extract()[0]
print '********** Page Title:', myheader.encode('utf-8'), '**********'
print '-' * 170
match1 = re.search(re.escape("DataStore.prime('stage-player-stat', defaultTeamPlayerStatsConfigParams.defaultParams , ") \
+ '(\[.*\])' + re.escape(");"), response.body)
if match1 is not None:
playerdata1 = match1.group(1)
teamid = json.loads(playerdata1[0])
print teamid
The key for the first element of 'playerdata1' is called 'TeamId'. I assumed the above method would work, however I am getting the following error:
teamid = json.loads(playerdata1[0])
File "C:\Python27\lib\json\__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "C:\Python27\lib\json\decoder.py", line 366, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Python27\lib\json\decoder.py", line 382, in raw_decode
obj, end = self.scan_once(s, idx)
exceptions.ValueError: Expecting object: line 1 column 1 (char 0)
Can anyone see what the issue is here?
match1.group(1)to be a JSON string? Tryteamid = json.loads(playerdata1)[0]instead?DataStore.primetext in it.