0

I am trying to figure out where the problem is in my code:

from queue import Queue
from threading import Thread
from html.parser import HTMLParser
import urllib.request

# Sites that consumer() hands to the worker threads.
hosts = [
    "http://yahoo.com",
    "http://google.com",
    "http://ibm.com",
]

# Shared work queue: consumer() puts hosts on it, ThreadUrl.run() takes them off.
queue = Queue()

class ThreadUrl(Thread):
    """Worker thread that downloads the start of every URL taken from a queue.

    The downloaded bytes are discarded; nothing is parsed or printed on
    success.
    """

    def __init__(self, queue):
        """Store the queue that ``run`` pulls host URLs from.

        Args:
            queue: Queue object providing ``get()`` and ``task_done()``;
                each item is passed to ``urllib.request.urlopen``.
        """
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Fetch queued URLs forever, marking every item done exactly once.

        The loop has no exit condition, so the thread only ends when the
        process does. A failed fetch is reported and the worker moves on to
        the next item.
        """
        while True:
            host = self.queue.get()
            try:
                # The context manager closes the response even if read() fails.
                with urllib.request.urlopen(host) as url:
                    url.read(4096)  # at most the first 4096 bytes are read
            except Exception as exc:
                # run() is this thread's top-level boundary: one bad host must
                # not kill the worker and strand the remaining queue items.
                print("Failed to fetch", host, "-", exc)
            finally:
                # Always balance get(); otherwise queue.join() blocks forever
                # after a failed download.
                self.queue.task_done()


class MyHTMLParser(HTMLParser):
    """HTML parser that echoes every start tag and its attributes to stdout."""

    def handle_starttag(self, tag, attrs):
        """Print the tag name, then one indented line per attribute entry.

        Args:
            tag: The start tag being reported.
            attrs: Iterable of attribute entries for that tag; each entry is
                printed as-is.
        """
        print("Start tag:", tag)
        for entry in attrs:
            print("     attr:", entry)



def consumer():
    """Start three daemon workers, enqueue every host, and wait for the queue.

    NOTE: the parser created here is fed the host URL text itself, not any
    page content downloaded by the workers.
    """
    workers = [ThreadUrl(queue) for _ in range(3)]
    for worker in workers:
        worker.setDaemon(True)
        worker.start()

    for host in hosts:
        parser = MyHTMLParser()
        parser.feed(host)
        queue.put(host)

    # Block until every queued host has been marked done by a worker.
    queue.join()

# Runs as soon as this file is executed or imported (there is no __main__ guard).
consumer()

My goal is to extract the content of the URLs, read the queue, and finally parse it. When I execute the code, it does not print anything. Where should I place the parser?

3
  • parser.feed(host) makes no sense; you need to call the feed method with the HTML returned by url.read(4096). Commented Sep 26, 2017 at 6:37
  • @lcastillov I understand now, but should I make a new class or what? Commented Sep 26, 2017 at 6:38
  • Use the parser inside the run method, and just insert URLs in the queue. Create a MyHTMLParser instance inside the ThreadUrl.run method and use it to process the content fetched for each incoming host. Commented Sep 26, 2017 at 6:40

1 Answer 1

1

Here is an example:

from queue import Queue
from threading import Thread
from html.parser import HTMLParser
import urllib.request


# How many ThreadUrl workers consumer() starts.
NUMBER_OF_THREADS = 3


# URLs that consumer() places on the work queue.
HOSTS = [
    "http://yahoo.com",
    "http://google.com",
    "http://ibm.com",
]


class MyHTMLParser(HTMLParser):
    """HTML parser that reports every start tag and its attributes on stdout."""

    def handle_starttag(self, tag, attrs):
        """Print the tag name followed by one tab-indented line per attribute.

        Args:
            tag: The start tag being reported.
            attrs: Iterable of attribute entries for that tag; each entry is
                printed as-is.
        """
        print("Start tag:", tag)
        for entry in attrs:
            print("\tattr:", entry)


class ThreadUrl(Thread):
    """Worker thread that downloads each queued URL and parses its HTML."""

    def __init__(self, queue):
        """Store the queue that ``run`` pulls host URLs from.

        Args:
            queue: Queue object providing ``get()`` and ``task_done()``;
                each item is passed to ``urllib.request.urlopen``.
        """
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Fetch and parse queued URLs forever, marking each item done once.

        At most the first 4096 bytes of every response are read and fed to a
        fresh ``MyHTMLParser``. The loop has no exit condition, so the thread
        only ends when the process does. A failure on one host is reported
        and the worker continues with the next item.
        """
        while True:
            host = self.queue.get()
            try:
                # The context manager closes the response even if read() fails.
                with urllib.request.urlopen(host) as response:
                    raw = response.read(4096)
                    charset = response.headers.get_content_charset() or "utf-8"
                # Decode rather than str(raw): str() on bytes yields the
                # "b'...'" repr with escaped newlines, not the page text.
                content = raw.decode(charset, errors="replace")
                parser = MyHTMLParser()
                parser.feed(content)
            except Exception as exc:
                # run() is this thread's top-level boundary: one bad host must
                # not kill the worker and strand the remaining queue items.
                print("Failed to process", host, "-", exc)
            finally:
                # Always balance get(); otherwise queue.join() blocks forever
                # after a failed download or parse.
                self.queue.task_done()


def consumer():
    """Start the worker threads, enqueue every host, and wait for completion.

    Creates a private queue, starts ``NUMBER_OF_THREADS`` ``ThreadUrl``
    workers on it, puts each entry of ``HOSTS`` on the queue, then blocks
    until every entry has been marked done.
    """
    queue = Queue()
    for _ in range(NUMBER_OF_THREADS):
        thread = ThreadUrl(queue)
        # Daemon threads let the interpreter exit even though run() loops
        # forever. Set the attribute directly: Thread.setDaemon() is
        # deprecated since Python 3.10.
        thread.daemon = True
        thread.start()
    for host in HOSTS:
        queue.put(host)
    queue.join()


# Start the downloads only when run as a script, not when imported.
if __name__ == '__main__':
    consumer()
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.