3

I am trying to web scrape an HTML table using Python. There are many tables in the HTML page, but I want to scrape one particular table only. I am using Beautiful Soup to do this web scraping.

My code looks like this:

# Fetch the price-history page and look for the "ARGOR CAST BAR" row.
import requests
from bs4 import BeautifulSoup

page = requests.get("http://uobgoldprice.com/history/2018/September/10/")
html = BeautifulSoup(page.content, 'html.parser')

for p in html.select('tr'):
    # A <tr>'s .text concatenates every cell plus surrounding whitespace, so a
    # strict equality against the bare label never matches. Strip the text and
    # test containment of the label instead.
    if "ARGOR CAST BAR" in p.text:
        print(p.text.strip())

I would like only the table that reads "Rate as at Monday, 10 September 2018".

How do I go about doing that?

1
  • Is this text always the same? You could grab all the tables and check whether the element with class 'title_table' matches the text you are looking for. Commented Sep 14, 2018 at 17:51

3 Answers 3

2

You need to find the element that contains the text, then walk up to its parent that is a table:

import re
import requests
from bs4 import BeautifulSoup

page = requests.get("http://uobgoldprice.com/history/2018/September/10/")
html = BeautifulSoup(page.content, 'html.parser')

# string= replaces the deprecated text= keyword (Beautiful Soup >= 4.4), and
# find_parent is the PEP 8 spelling of the legacy findParent alias.
element = html.find(string=re.compile('Rate as at Monday, 10 September 2018'))
if element is not None:  # find() returns None when nothing matches
    print(element.find_parent('table'))
Sign up to request clarification or add additional context in comments.

Comments

1
from collections import defaultdict

import requests
from bs4 import BeautifulSoup


def get_page_html(url):
    """Download *url* and return the response body as text.

    Raises requests.HTTPError for a 4xx/5xx status code.
    """
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def parse_last_table(html):
    """Parse the final <table> in *html* into {description: [row dicts]}.

    A row whose first cell is blank inherits the most recent non-blank
    description; rows without exactly five <td> cells are skipped.
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find_all('table')[-1]

    parsed = defaultdict(list)
    current_key = None
    # The first two rows are header rows, hence the [2:] slice.
    for tr in table.find_all('tr')[2:]:
        cells = [td.text.strip() for td in tr.find_all('td')]
        if len(cells) != 5:
            continue  # blank/empty row

        desc, currency, unit, sells, buys = cells
        if not desc:
            desc = current_key
        parsed[desc].append({
            'currency': currency,
            'unit': unit,
            'bank_sells': sells,
            'bank_buys': buys
        })
        current_key = desc
    return parsed

Output:

>>> url = 'http://uobgoldprice.com/history/2018/September/10/'
>>> page_html = get_page_html(url)
>>> result = parse_last_table(page_html)
>>> import json; print(json.dumps(result, indent=2))
{
  "ARGOR CAST BAR": [
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,369.00 (+4.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "CAST BARS": [
    {
      "currency": "SGD",
      "unit": "1 KILOBAR",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD CERTIFICATE": [
    {
      "currency": "SGD",
      "unit": "1 KILOCERT",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD SAVINGS A/C": [
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "53.20 (+0.04)",
      "bank_buys": "52.94 (+0.04)"
    }
  ],
  "GOLD BULLION COINS": [
    {
      "currency": "SGD",
      "unit": "1/20 OZ(GNC,SLC &GML)",
      "bank_sells": "131.00",
      "bank_buys": "81.00"
    },
    {
      "currency": "SGD",
      "unit": "1/10 OZ",
      "bank_sells": "211.00 (+1.00)",
      "bank_buys": "163.00"
    },
    {
      "currency": "SGD",
      "unit": "1/4 OZ",
      "bank_sells": "465.00",
      "bank_buys": "410.00"
    },
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "904.00 (+1.00)",
      "bank_buys": "822.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,726.00 (+1.00)",
      "bank_buys": "1,645.00 (+1.00)"
    }
  ],
  "PAMP GOLD BARS": [
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "876.00",
      "bank_buys": "821.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "82.00",
      "bank_buys": "50.00"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,711.00 (+1.00)",
      "bank_buys": "1,644.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "2.5 GM",
      "bank_sells": "182.00",
      "bank_buys": "130.00"
    },
    {
      "currency": "SGD",
      "unit": "5 GM",
      "bank_sells": "322.00",
      "bank_buys": "262.00"
    },
    {
      "currency": "SGD",
      "unit": "10 GM",
      "bank_sells": "597.00 (+1.00)",
      "bank_buys": "527.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "20 GM",
      "bank_sells": "1,132.00 (+1.00)",
      "bank_buys": "1,056.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "50 GM",
      "bank_sells": "2,746.00 (+2.00)",
      "bank_buys": "2,644.00 (+2.00)"
    },
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,414.00 (+3.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "SILVER PASSBOOK ACCOUNT": [
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "19.86 (+0.09)",
      "bank_buys": "19.30 (+0.09)"
    }
  ]
}

Comments

0

I believe this code will help you. If you want the complete running project, see the "HTML to PDF web scraping" project.

import logging
import math
import json
from flask import jsonify, abort, make_response
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pydf

from constants import Constants
from response import Response


class SeleniumCrawler(object):
    """Scrape a paginated 'kbn-table' with headless Chrome and dump it to a PDF."""

    def get_page(self, url):
        """Scrape every page of the table at *url*, write it to out.pdf,
        and return a JSON-serializable success payload.

        On any failure, logs the exception and aborts with a Flask error
        response built from the project's Response/Constants helpers.
        """
        response = Response()
        try:

            # Initilized the chrome driver
            print("Initilized the chrome driver")
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--window-size=1420,1080')
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            browser = webdriver.Chrome(chrome_options=chrome_options)

            # browser url
            browser.get(url)
            # NOTE(review): WebDriverWait's timeout is in *seconds*, so 10000
            # waits up to ~2.8 hours — 10 was probably intended; confirm.
            delay = 10000

            # wait till specific classes appears
            print("wait till specific classes appears")
            WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'kbn-table')))
            body = browser.find_element_by_class_name("kbn-table").get_attribute('innerHTML')

            # calculate number of pages exists and loop them
            print("calculate number of pages exists and loop them")
            pages = (str(browser.find_element_by_class_name("kuiToolBarText").text).split(" ")[2]).replace(",", "")
            pages = math.ceil(int(pages) / 50) - 1

            print("pages found {}".format(pages))
            for page in range(1, pages):
                # Click the "next page" button, then append that page's rows.
                # (Fixed: these three lines were un-indented out of the loop and
                # contained stray "`enter code here`" text — a syntax error.)
                browser.execute_script("document.getElementsByClassName('kuiButton')[1].click()")
                chunk = browser.find_element_by_class_name("kbn-table").get_attribute('innerHTML').replace("<tbody>", "")
                body += chunk

            # apply table tags and generate pdf
            print("apply table tags and generate pdf")
            pdf = pydf.generate_pdf("<table>" + body + "</table>")
            with open('out.pdf', 'wb') as f:
                f.write(pdf)

            return json.loads(json.dumps((response.get_response(Constants.SUCCESS, Constants.SUCCESS))))
        except Exception as e:
            logging.exception(e)

            return abort(make_response(jsonify(response.get_response(Constants.SERVER_ERROR, Constants.SERVER_ERROR)), response.get_code(Constants.SERVER_ERROR)))

2 Comments

What would be the purpose of using selenium on the particular url OP provided? The table isn't being loaded by javascript?
There were some security issues with my URL. At load time there was no data, just an empty iframe. After some time, data used to appear in that iframe.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.