An alternative solution is not to dive into regexes at all, but to parse the JavaScript code with a JavaScript parser. Example using slimit:
SlimIt is a JavaScript minifier written in Python. It compiles
JavaScript into more compact code so that it downloads and runs
faster.
SlimIt also provides a library that includes a JavaScript parser,
lexer, pretty printer and a tree visitor.
from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor
# Sample JavaScript source, held as a plain string, that is handed to the
# slimit parser below.
data = """
var defaultTeamStatsConfigParams = {
data:{
url: 'stage-team-stat'
},
defaultParams: {
stageId : 9155,
field: 2,
teamId: 32
}
};
DataStore.prime('stage-team-stat', defaultTeamStatsConfigParams.defaultParams, [{"RegionId":252,"RegionCode":"gb-eng","TournamentName":"Premier League","TournamentId":2,"StageId":9155,"Field":{"Value":2,"DisplayName":"Overall"},"TeamName":"Manchester United","TeamId":32,"GamesPlayed":4,"Goals":6,"Yellow":7,"Red":0,"TotalPasses":2480,"Possession":247,"AccuratePasses":2167,"AerialWon":61,"AerialLost":49,"Rating":7.01,"DefensiveRating":7.01,"OffensiveRating":6.79,"ShotsConcededIBox":13,"ShotsConcededOBox":21,"TotalTackle":75,"Interceptions":71,"Fouls":54,"WasFouled":46,"TotalShots":49,"ShotsBlocked":9,"ShotsOnTarget":19,"Dribbles":44,"Offsides":3,"Corners":17,"Throws":73,"Dispossesed":36,"TotalClearance":78,"Turnover":0,"Ranking":0}]);
var stageStatsConfig = {
id: 'team-stage-stats',
singular: true,
filter: {
instanceType: WS.Filter,
id: 'team-stage-stats-filter',
categories: { data: [{ value: 'field' }] },
singular: true
},
params: defaultTeamStatsConfigParams,
content: {
instanceType: TeamStageStats,
view: {
renderTo: 'team-stage-stats-content'
}
}
};
var stageStats = new WS.Panel(stageStatsConfig);
stageStats.load();
"""
# Parse the JavaScript snippet into a syntax tree.
parser = Parser()
tree = parser.parse(data)

# Walk every node of the tree and keep only the assignments. Each entry maps
# the `value` attribute of the assignment's left-hand node to the `value`
# attribute of its right-hand node; a side without a `value` attribute
# contributes ''. When the same key occurs more than once, the last
# occurrence wins.
fields = {
    getattr(node.left, 'value', ''): getattr(node.right, 'value', '')
    for node in nodevisitor.visit(tree)
    if isinstance(node, ast.Assign)
}

# A single pre-formatted argument gives the same space-separated output under
# both the Python 2 print statement and the Python 3 print function.
print('%s %s %s' % (fields['stageId'], fields['field'], fields['teamId']))
Prints 9155 2 32.
Here we are iterating over the syntax tree nodes and constructing a dictionary from all assignments. Among them we have stageId, field and teamId.
Here is how you can apply the solution to your scrapy spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor
def get_fields(data):
    """Collect the assignments found in a piece of JavaScript source.

    The source is parsed with slimit and every ``ast.Assign`` node of the
    resulting tree is recorded.

    :param data: JavaScript source code to parse.
    :returns: dict keyed by the ``value`` attribute of each assignment's
        left-hand node, holding the ``value`` attribute of its right-hand
        node. A missing ``value`` attribute is recorded as ``''``; when a
        key occurs more than once, the last occurrence wins.
    """
    syntax_tree = Parser().parse(data)

    collected = {}
    for js_node in nodevisitor.visit(syntax_tree):
        # Only assignment nodes contribute to the result.
        if not isinstance(js_node, ast.Assign):
            continue
        key = getattr(js_node.left, 'value', '')
        collected[key] = getattr(js_node.right, 'value', '')
    return collected
class ExampleSpider(CrawlSpider):
    """Crawl a whoscored.com team statistics page and print script fields.

    For each page matched by ``rules``, ``parse_item`` locates the inline
    script that follows the ``team-stage-stats`` div, parses it with
    ``get_fields`` and prints the ``stageId``, ``field`` and ``teamId``
    entries.
    """

    name = "goal2"
    allowed_domains = ["whoscored.com"]
    start_urls = ["http://www.whoscored.com/Teams/32/Statistics/England-Manchester-United"]
    download_delay = 5

    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=('http://www.whoscored.com/Teams/32/Statistics/England-Manchester-United'),
                deny=('/News', '/Graphics', '/Articles', '/Live', '/Matches',
                      '/Explanations', '/Glossary', 'ContactUs', 'TermsOfUse',
                      'Jobs', 'AboutUs', 'RSS'),
            ),
            follow=False,
            callback='parse_item',
        ),
    ]

    def parse_item(self, response):
        """Print stageId, field and teamId read from the page's inline script.

        :param response: the downloaded page handed over by scrapy.
        :returns: None. Pages without the expected script are logged and
            skipped.
        """
        sel = Selector(response)

        # Page title; extracted but not used further in this callback.
        titles = sel.xpath("normalize-space(//title)")
        myheader = titles.extract()[0]

        scripts = sel.xpath('//div[@id="team-stage-stats"]/following-sibling::script/text()').extract()
        if not scripts:
            # Without this guard, indexing [0] on an empty result raises
            # IndexError for any page that lacks the expected script.
            self.log("No script found after div#team-stage-stats on %s" % response.url)
            return

        script_fields = get_fields(scripts[0])

        # A single pre-formatted argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s %s' % (script_fields['stageId'], script_fields['field'], script_fields['teamId']))
… scrapy being involved here. Because, strictly speaking, the question is not scrapy-specific, but more about "how to extract certain fields from a string that is a snippet of JavaScript code".
… re.compile, you get back a regex object that you can use directly: stagematch.match(var), not re.match(stagematch, var).
… re.search returns None. So, it's not actually demonstrating your problem at all.