3

I am trying to web scrape an HTML table using Python. There are many tables in the HTML page, but I want to scrape one particular table only. I am using Beautiful Soup to do this web scraping.

My code looks like this:

# Fetch the price-history page and look for the "ARGOR CAST BAR" row.
import requests
from bs4 import BeautifulSoup

page = requests.get("http://uobgoldprice.com/history/2018/September/10/")
html = BeautifulSoup(page.content, 'html.parser')

for p in html.select('tr'):
    # A <tr>'s .text concatenates every cell plus surrounding whitespace, so a
    # strict equality against the bare label never matches. Strip the text and
    # test containment of the label instead.
    if "ARGOR CAST BAR" in p.text:
        print(p.text.strip())

I would like only the table that reads "Rate as at Monday, 10 September 2018".

How do I go about doing that?

1
  • Is this text always the same? You could grab all the tables and check whether the element with class 'title_table' matches the text you are looking for. Commented Sep 14, 2018 at 17:51

3 Answers 3

2

You need to find the element that contains the text, then walk up to its parent that is a table:

import re
import requests
from bs4 import BeautifulSoup

page = requests.get("http://uobgoldprice.com/history/2018/September/10/")
html = BeautifulSoup(page.content, 'html.parser')

# string= replaces the deprecated text= keyword (Beautiful Soup >= 4.4), and
# find_parent is the PEP 8 spelling of the legacy findParent alias.
element = html.find(string=re.compile('Rate as at Monday, 10 September 2018'))
if element is not None:  # find() returns None when nothing matches
    print(element.find_parent('table'))
Sign up to request clarification or add additional context in comments.

Comments

1
from collections import defaultdict

import requests
from bs4 import BeautifulSoup


def get_page_html(url):
    """Download *url* and return the response body as text.

    Raises requests.HTTPError for a 4xx/5xx status code.
    """
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def parse_last_table(html):
    """Parse the final <table> in *html* into {description: [row dicts]}.

    A row whose first cell is blank inherits the most recent non-blank
    description; rows without exactly five <td> cells are skipped.
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find_all('table')[-1]

    parsed = defaultdict(list)
    current_key = None
    # The first two rows are header rows, hence the [2:] slice.
    for tr in table.find_all('tr')[2:]:
        cells = [td.text.strip() for td in tr.find_all('td')]
        if len(cells) != 5:
            continue  # blank/empty row

        desc, currency, unit, sells, buys = cells
        if not desc:
            desc = current_key
        parsed[desc].append({
            'currency': currency,
            'unit': unit,
            'bank_sells': sells,
            'bank_buys': buys
        })
        current_key = desc
    return parsed

Output:

>>> url = 'http://uobgoldprice.com/history/2018/September/10/'
>>> page_html = get_page_html(url)
>>> result = parse_last_table(page_html)
>>> import json; print(json.dumps(result, indent=2))
{
  "ARGOR CAST BAR": [
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,369.00 (+4.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "CAST BARS": [
    {
      "currency": "SGD",
      "unit": "1 KILOBAR",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD CERTIFICATE": [
    {
      "currency": "SGD",
      "unit": "1 KILOCERT",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD SAVINGS A/C": [
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "53.20 (+0.04)",
      "bank_buys": "52.94 (+0.04)"
    }
  ],
  "GOLD BULLION COINS": [
    {
      "currency": "SGD",
      "unit": "1/20 OZ(GNC,SLC &GML)",
      "bank_sells": "131.00",
      "bank_buys": "81.00"
    },
    {
      "currency": "SGD",
      "unit": "1/10 OZ",
      "bank_sells": "211.00 (+1.00)",
      "bank_buys": "163.00"
    },
    {
      "currency": "SGD",
      "unit": "1/4 OZ",
      "bank_sells": "465.00",
      "bank_buys": "410.00"
    },
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "904.00 (+1.00)",
      "bank_buys": "822.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,726.00 (+1.00)",
      "bank_buys": "1,645.00 (+1.00)"
    }
  ],
  "PAMP GOLD BARS": [
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "876.00",
      "bank_buys": "821.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "82.00",
      "bank_buys": "50.00"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,711.00 (+1.00)",
      "bank_buys": "1,644.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "2.5 GM",
      "bank_sells": "182.00",
      "bank_buys": "130.00"
    },
    {
      "currency": "SGD",
      "unit": "5 GM",
      "bank_sells": "322.00",
      "bank_buys": "262.00"
    },
    {
      "currency": "SGD",
      "unit": "10 GM",
      "bank_sells": "597.00 (+1.00)",
      "bank_buys": "527.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "20 GM",
      "bank_sells": "1,132.00 (+1.00)",
      "bank_buys": "1,056.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "50 GM",
      "bank_sells": "2,746.00 (+2.00)",
      "bank_buys": "2,644.00 (+2.00)"
    },
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,414.00 (+3.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "SILVER PASSBOOK ACCOUNT": [
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "19.86 (+0.09)",
      "bank_buys": "19.30 (+0.09)"
    }
  ]
}

Comments

0

I believe this code will help you. If you want the complete running project, see the "HTML to PDF web scraping" project.

import logging
import math
import json
from flask import jsonify, abort, make_response
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pydf

from constants import Constants
from response import Response


class SeleniumCrawler(object):
    """Scrape a paginated 'kbn-table' with headless Chrome and dump it to a PDF."""

    def get_page(self, url):
        """Scrape every page of the table at *url*, write it to out.pdf,
        and return a JSON-serializable success payload.

        On any failure, logs the exception and aborts with a Flask error
        response built from the project's Response/Constants helpers.
        """
        response = Response()
        try:

            # Initilized the chrome driver
            print("Initilized the chrome driver")
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--window-size=1420,1080')
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            browser = webdriver.Chrome(chrome_options=chrome_options)

            # browser url
            browser.get(url)
            # NOTE(review): WebDriverWait's timeout is in *seconds*, so 10000
            # waits up to ~2.8 hours — 10 was probably intended; confirm.
            delay = 10000

            # wait till specific classes appears
            print("wait till specific classes appears")
            WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'kbn-table')))
            body = browser.find_element_by_class_name("kbn-table").get_attribute('innerHTML')

            # calculate number of pages exists and loop them
            print("calculate number of pages exists and loop them")
            pages = (str(browser.find_element_by_class_name("kuiToolBarText").text).split(" ")[2]).replace(",", "")
            pages = math.ceil(int(pages) / 50) - 1

            print("pages found {}".format(pages))
            for page in range(1, pages):
                # Click the "next page" button, then append that page's rows.
                # (Fixed: these three lines were un-indented out of the loop and
                # contained stray "`enter code here`" text — a syntax error.)
                browser.execute_script("document.getElementsByClassName('kuiButton')[1].click()")
                chunk = browser.find_element_by_class_name("kbn-table").get_attribute('innerHTML').replace("<tbody>", "")
                body += chunk

            # apply table tags and generate pdf
            print("apply table tags and generate pdf")
            pdf = pydf.generate_pdf("<table>" + body + "</table>")
            with open('out.pdf', 'wb') as f:
                f.write(pdf)

            return json.loads(json.dumps((response.get_response(Constants.SUCCESS, Constants.SUCCESS))))
        except Exception as e:
            logging.exception(e)

            return abort(make_response(jsonify(response.get_response(Constants.SERVER_ERROR, Constants.SERVER_ERROR)), response.get_code(Constants.SERVER_ERROR)))

2 Comments

What would be the purpose of using selenium on the particular url OP provided? The table isn't being loaded by javascript?
There were some security issues with my URL. At load time there was no data, just an empty iframe. After some time, data used to appear in that iframe.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.