2

I am going to scrape the JavaScript-generated tables from the link below: http://data2.7m.cn/history_Matches_Data/2009-2010/92/en/index.shtml

import codecs
import lxml.html as lh
from lxml import etree
import requests
from selenium import webdriver
import urllib2
from bs4 import BeautifulSoup

# The match tables are generated by JavaScript, so fetch the page with a
# real browser (Selenium-driven Firefox) rather than a plain HTTP request.
URL = 'http://data2.7m.cn/history_Matches_Data/2009-2010/92/en/index.shtml'

profile = webdriver.FirefoxProfile()
# Raise the per-host connection cap so the page's many subresources load faster.
profile.set_preference('network.http.max-connections', 30)
profile.update_preferences()

browser = webdriver.Firefox(profile)
browser.get(URL)

# page_source is already a single string -- the original ''.join(content)
# was a no-op. Name the parser explicitly: omitting it is deprecated and
# makes the result depend on whichever parser happens to be installed.
content = browser.page_source
soup = BeautifulSoup(content, 'html.parser')

Once I have the contents of the web page, I need to find the number of rounds of soccer matches in that particular league.

The code below only finds a single table; how can I get the tables for all 38 rounds of soccer matches? Thank you.

# Collect the round-selector cells (class "lsm2").
# NOTE(review): the result is discarded here -- assign it to a variable
# (e.g. rounds = soup.findAll(...)) and use len(rounds) for the round count.
soup.findAll('td', attrs={'class': 'lsm2'})

# Print the result table of the currently selected round. Only the active
# round's table (div id "Match_Table") is present in the parsed HTML;
# presumably the other rounds (ids s1..s38) are rendered only after their
# tab is clicked in the browser -- see the accepted answer below.
print soup.find("div", {"id": "Match_Table"}).prettify()

2 Answers 2

2
# ============================================================
import codecs
import lxml.html as lh
from lxml import etree
import requests
from selenium import webdriver
import urllib2
from bs4 import BeautifulSoup
from pandas import DataFrame, Series
import html5lib

URL = 'http://data2.7m.cn/history_Matches_Data/2009-2010/92/en/index.shtml'
profile = webdriver.FirefoxProfile()
profile.set_preference('network.http.max-connections', 30)
profile.update_preferences()
browser = webdriver.Firefox(profile)
browser.get(URL)

content = browser.page_source
soup = BeautifulSoup(''.join(content))
# num = soup.findAll('td', attrs={'class': 'lsm2'})
# num = soup.findAll('table')[2].findAll('td')[37].text
# soup.findAll('table',attrs={'class':'e_run_tb'})

    num1 = soup.findAll('table')[2].findAll('tr')
    for i in range(1,len(num1)+1):
        for j in range(1,len(num1[i-1])+1):
            # click button on website
            clickme = browser.find_element_by_xpath('//*[@id="e_run_tb"]/tbody/tr'+'['+str(i)+']'+'/td'+'['+str(j)+']')
            clickme.click()

            content = browser.page_source
            soup = BeautifulSoup(''.join(content))

            table = soup.find('div', attrs={'class': 'e_matches'})
            rows = table.findAll('tr')
#           for tr in rows:
#             cols = tr.findAll('td')
#             for td in cols:
#                    text = td.find(text=True)
#                    print text,
#                print
            for tr in rows[5:16]: #from row 5 to 16
                cols = tr.findAll('td')
                for td in cols:
                    text = td.find(text=True)
                    print text,
                print
            print
Sign up to request clarification or add additional context in comments.

Comments

0

The easiest thing might be to use Selenium to click the lsm2 links from 2 to 38 (since round 1 is shown initially) and then scrape the table with id Match_Table after each click, accumulating your results as you go.

1 Comment

But that seems time-consuming, as there are quite a lot of English soccer leagues that need to be scraped. Any better idea? :)

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.