
I am trying to perfect this web scraper that uses Chrome WebDriver to scrape pages. It currently keeps breaking at line 74:

soup = BeautifulSoup(HTML, "html.parser")

with the error code:

AttributeError: 'str' object has no attribute 'text'.

How can I fix this? I'm not sure why it keeps breaking at that point.

    import urllib2, sys
    from BeautifulSoup import BeautifulSoup
    from datetime import datetime
    import requests
    from lxml import html
    import traceback
    import csv
    import time
    import json

    import selenium
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.keys import Keys

    username = "user" # your email here
    password = "pass" # your password here

    pages = 10
    companyName = "Apple"
    companyURL = "https://www.glassdoor.com/Reviews/Apple-US-Reviews-EI_IE7438.0,6_IL.7,9_IN1.htm?filter.defaultEmploymentStatuses=false&filter.defaultLocation=false"

    def obj_dict(obj):
        return obj.__dict__
    #enddef

    def json_export(data):
      jsonFile = open(companyName + ".json", "w")
      jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
      jsonFile.close()
    #enddef

    def init_driver():
        driver = webdriver.Chrome(r'C:\Python27\chromedriver.exe')
        driver.wait = WebDriverWait(driver, 10)
        return driver
    #enddef

    def login(driver, username, password):
        driver.get("http://www.glassdoor.com/profile/login_input.htm")
        try:
            user_field = driver.wait.until(EC.presence_of_element_located(
                (By.NAME, "username")))
            pw_field = driver.find_element_by_class_name("signin-password")
            login_button = driver.find_element_by_id("signInBtn")
            user_field.send_keys(username)
            user_field.send_keys(Keys.TAB)
            time.sleep(1)
            pw_field.send_keys(password)
            time.sleep(1)
            login_button.click()
        except TimeoutException:
            print("TimeoutException! Username/password field or login button not found on glassdoor.com")
    #enddef

    ###
    def get_data(driver, URL, startPage, endPage, data, refresh):
      if (startPage > endPage):
        return data
      #endif
      print "\nPage " + str(startPage) + " of " + str(endPage)
      currentURL = URL + "_IP" + str(startPage) + ".htm"
      time.sleep(2)
      if (refresh):
        driver.get(currentURL)
        print "Getting " + currentURL
      #endif
      time.sleep(2)
      HTML = driver.page_source
      soup = BeautifulSoup(HTML, "html.parser")
      reviews = soup.find_all("li", { "class" : ["empReview", "padVert"] })
      if (reviews):
        data = parse_reviews_HTML(reviews, data)
        print "Page " + str(startPage) + " scraped."
        if (startPage % 10 == 0):
          print "\nTaking a breather for a few seconds ..."
          time.sleep(10)
        #endif
        get_data(driver, URL, startPage + 1, endPage, data, True)
      else:
        print "Waiting ... page still loading or CAPTCHA input required"
        time.sleep(3)
        get_data(driver, URL, startPage, endPage, data, False)
      #endif
      return data
    #enddef

    if __name__ == "__main__":
      driver = init_driver()
      time.sleep(3)
      print "Logging into Glassdoor account ..."
      login(driver, username, password)
      time.sleep(5)
      print "\nStarting data scraping ..."
      data = get_data(driver, companyURL[:-4], 1, pages, [], True)
      print "\nExporting data to " + Apple + ".json"
      json_export(data)
      driver.quit()
    #endif


    summary_box = soup.find('span', attrs={'class': 'summary '})

    summary = summary_box.text.strip()
    print summary
  • Have you tried using a different parser? Commented Nov 7, 2017 at 21:58
  • I have not. I thought about using the html5lib one or lxml. Do you have any suggestions on which would be best to employ here? Commented Nov 7, 2017 at 22:24
  • I usually just do trial and error till I find one that works, though lxml seems to be the most common (see the parser sketch below). Commented Nov 7, 2017 at 23:21
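
Following up on the parser discussion above, a minimal sketch of how the common parsers are selected in bs4 (the HTML string and tag here are placeholders; lxml and html5lib are optional third-party installs):

    from bs4 import BeautifulSoup

    html = '<li class="empReview">Example review</li>'

    # Built-in parser: ships with Python, no extra install needed
    soup = BeautifulSoup(html, "html.parser")

    # lxml: fast C-based parser (pip install lxml)
    soup = BeautifulSoup(html, "lxml")

    # html5lib: parses the way a browser would (pip install html5lib)
    soup = BeautifulSoup(html, "html5lib")

    print(soup.li.text)

If a requested parser isn't installed, bs4 raises bs4.FeatureNotFound rather than silently falling back, so trial and error fails loudly instead of producing subtly different trees.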

1 Answer

You are probably using BeautifulSoup version 3 (I tried with it and the problem occurred). In version 3 the second positional argument is a SoupStrainer filter rather than a parser name, so the string "html.parser" ends up having .text accessed on it, which would explain the AttributeError you see. Even if that's not the case, try removing the "html.parser" argument, making it just:

soup = BeautifulSoup(HTML)

I hope it'll work.
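
For completeness, a minimal sketch of the bs4-based fix the asker reports using in the comment below (assuming beautifulsoup4 is installed via pip install beautifulsoup4; the sample HTML is a stand-in for driver.page_source):

    from bs4 import BeautifulSoup  # bs4, not the legacy top-level BeautifulSoup module

    # Stand-in for HTML = driver.page_source in the scraper:
    HTML = '<html><body><li class="empReview">ok</li></body></html>'

    # bs4 accepts a parser name as the second argument,
    # so the failing line 74 works unchanged once the import is updated:
    soup = BeautifulSoup(HTML, "html.parser")
    print(soup.find("li").text)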


1 Comment

It worked! Thanks so much! I updated to bs4 instead. Didn't realize I had 3. Now to debug the rest of the code.... -__-
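
For anyone else hitting this, a quick sketch for checking which version is actually installed (both bs4 and, as far as I know, the legacy module expose a __version__ attribute):

    # The import name is the quickest tell: bs4 is version 4,
    # the top-level BeautifulSoup module is the legacy version 3.
    try:
        import bs4
        print("bs4 (BeautifulSoup 4) installed: " + bs4.__version__)
    except ImportError:
        import BeautifulSoup
        print("legacy BeautifulSoup 3 installed: " + BeautifulSoup.__version__)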
