2

I am going to scrape the JavaScript-generated tables from the link below: http://data2.7m.cn/history_Matches_Data/2009-2010/92/en/index.shtml

import codecs
import lxml.html as lh
from lxml import etree
import requests
from selenium import webdriver
import urllib2
from bs4 import BeautifulSoup

# The match tables are generated by JavaScript, so fetch the page with a
# real browser (Selenium-driven Firefox) rather than a plain HTTP request.
URL = 'http://data2.7m.cn/history_Matches_Data/2009-2010/92/en/index.shtml'

profile = webdriver.FirefoxProfile()
# Raise the per-host connection cap so the page's many subresources load faster.
profile.set_preference('network.http.max-connections', 30)
profile.update_preferences()

browser = webdriver.Firefox(profile)
browser.get(URL)

# page_source is already a single string -- the original ''.join(content)
# was a no-op. Name the parser explicitly: omitting it is deprecated and
# makes the result depend on whichever parser happens to be installed.
content = browser.page_source
soup = BeautifulSoup(content, 'html.parser')

Once I have the contents of the web page, I need to find the number of rounds of soccer matches in that particular league.

The code below only finds a single table; how can I get the tables for all 38 rounds of soccer matches? Thank you.

# Collect the round-selector cells (class "lsm2").
# NOTE(review): the result is discarded here -- assign it to a variable
# (e.g. rounds = soup.findAll(...)) and use len(rounds) for the round count.
soup.findAll('td', attrs={'class': 'lsm2'})

# Print the result table of the currently selected round. Only the active
# round's table (div id "Match_Table") is present in the parsed HTML;
# presumably the other rounds (ids s1..s38) are rendered only after their
# tab is clicked in the browser -- see the accepted answer below.
print soup.find("div", {"id": "Match_Table"}).prettify()

2 Answers 2

2
# ============================================================
import codecs
import lxml.html as lh
from lxml import etree
import requests
from selenium import webdriver
import urllib2
from bs4 import BeautifulSoup
from pandas import DataFrame, Series
import html5lib

URL = 'http://data2.7m.cn/history_Matches_Data/2009-2010/92/en/index.shtml'
profile = webdriver.FirefoxProfile()
profile.set_preference('network.http.max-connections', 30)
profile.update_preferences()
browser = webdriver.Firefox(profile)
browser.get(URL)

content = browser.page_source
soup = BeautifulSoup(''.join(content))
# num = soup.findAll('td', attrs={'class': 'lsm2'})
# num = soup.findAll('table')[2].findAll('td')[37].text
# soup.findAll('table',attrs={'class':'e_run_tb'})

    num1 = soup.findAll('table')[2].findAll('tr')
    for i in range(1,len(num1)+1):
        for j in range(1,len(num1[i-1])+1):
            # click button on website
            clickme = browser.find_element_by_xpath('//*[@id="e_run_tb"]/tbody/tr'+'['+str(i)+']'+'/td'+'['+str(j)+']')
            clickme.click()

            content = browser.page_source
            soup = BeautifulSoup(''.join(content))

            table = soup.find('div', attrs={'class': 'e_matches'})
            rows = table.findAll('tr')
#           for tr in rows:
#             cols = tr.findAll('td')
#             for td in cols:
#                    text = td.find(text=True)
#                    print text,
#                print
            for tr in rows[5:16]: #from row 5 to 16
                cols = tr.findAll('td')
                for td in cols:
                    text = td.find(text=True)
                    print text,
                print
            print
Sign up to request clarification or add additional context in comments.

Comments

0

The easiest thing might be to use Selenium to click the lsm2 links from 2 to 38 (since round 1 is shown initially) and then scrape the table with id Match_Table after each click, accumulating your results as you go.

1 Comment

But that seems time-consuming, as there are quite a lot of English soccer leagues that need to be scraped. Any better idea? :)

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.