Im trying to make a webcrawler that goes to a link and waits for the Javascript content to load. Then it should get all the links to the listed articles, before proceeding to the next page. The problem is it always scrapes from the first url ("https://techcrunch.com/search/heartbleed") instead of following the ones i gave it. Why does the following code not scrape from the new urls i passed in the reqeusts ? I am out of ideas...
import scrapy
from scrapy.http.request import Request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
import time
class TechcrunchSpider(scrapy.Spider):
name = "techcrunch_spider_performance"
allowed_domains = ['techcrunch.com']
start_urls = ['https://techcrunch.com/search/heartbleed']
def __init__(self):
self.driver = webdriver.PhantomJS()
self.driver.set_window_size(1120, 550)
#self.driver = webdriver.Chrome("C:\Users\Daniel\Desktop\Sonstiges\chromedriver.exe")
self.driver.wait = WebDriverWait(self.driver, 5) #wartet bis zu 5 sekunden
def parse(self, response):
start = time.time() #ZEITMESSUNG
self.driver.get(response.url)
#wartet bis zu 5 sekunden(oben definiert) auf den eintritt der condition, danach schmeist er den TimeoutException error
try:
self.driver.wait.until(EC.presence_of_element_located(
(By.CLASS_NAME, "block-content")))
print("Found : block-content")
except TimeoutException:
self.driver.close()
print(" block-content NOT FOUND IN TECHCRUNCH !!!")
#Crawle durch Javascript erstellte Inhalte mit Selenium
ahref = self.driver.find_elements(By.XPATH,'//h2[@class="post-title st-result-title"]/a')
hreflist = []
#Alle Links zu den jeweiligen Artikeln sammeln
for elem in ahref :
hreflist.append(elem.get_attribute("href"))
for elem in hreflist :
print(elem)
yield scrapy.Request(url=elem , callback=self.parse_content)
#Den link fuer die naechste seite holen
try:
next = self.driver.find_element(By.XPATH,"//a[@class='page-link next']")
nextpage = next.get_attribute("href")
print("JETZT KOMMT NEXT :")
print(nextpage)
#newresponse = response.replace(url=nextpage)
yield scrapy.Request(url=nextpage, dont_filter=False)
except TimeoutException:
self.driver.close()
print(" NEXT NOT FOUND(OR EOF) IM CLOSING MYSELF !!!")
end = time.time()
print("Time elapsed : ")
finaltime = end-start
print(finaltime)
def parse_content(self, response):
title = self.driver.find_element(By.XPATH,"//h1")
titletext = title.get_attribute("innerHTML")
print(" h1 : ")
print(title)
print(titletext)