Get retrieve XML attrib using ElementTree Python

Question

I am trying to get an element's attribute, but all I get is a None value or an empty list, depending on how I try to get it. Also, if anyone knows a better way to get a particular tag for the element, I would appreciate it. Here is the code; the spaced-out part is what should return the URL but doesn't.

import xml.etree.ElementTree as ET
import webbrowser,time,urllib.request
import tkinter as tk
import urllib

# webbrowser.get('windows-default').open_new('http://www.reddit.com/'+'r/blender')
# Root Tk window; it is passed to Application as its master and its destroy()
# method is wired to the QUIT button.
main = tk.Tk()
class Application(tk.Frame):
    """Tk frame that lists the item titles of a downloaded reddit RSS feed.

    Constructing the frame packs it, builds its widgets and then calls
    ``initial()``, which downloads ``http://www.reddit.com/.rss`` to
    ``rss.xml`` and parses it.
    """



    def __init__(self, master=None):
        """Initialise the frame, create the widgets and load the first feed."""
        tk.Frame.__init__(self, master)
        self.pack()
        self.createWidgets()
        self.initial()

    def createWidgets(self):
        """Create the entry, the two buttons, the scrollbar and the listbox."""
        # print('Went to createWidgets()')
        self.send_entry = tk.Entry(self)
        self.send_entry.grid(row=0,column=0)
        # The button passes the current entry text to getXML().
        # NOTE(review): the value stored is the result of .grid(), which
        # returns None in tkinter, so self.change_sub is not the Button.
        self.change_sub = tk.Button(self,text='Change Subreddit', command=lambda :self.getXML(self.send_entry.get())).grid(row=0 , column=2)
        self.lb_scrollY = tk.Scrollbar(self,orient=tk.VERTICAL)
        self.lb_scrollY.grid(row=1,column=1,sticky=tk.NS)
        # Listbox and scrollbar are linked in both directions.
        self.thread_lb = tk.Listbox(self,yscrollcommand=self.lb_scrollY.set)
        self.lb_scrollY['command']=self.thread_lb.yview
        self.thread_lb.grid(row=1,column=0)
        # Uses the module-level ``main`` window rather than self.master.
        # NOTE(review): as above, self.QUIT holds the result of .grid().
        self.QUIT = tk.Button(self, text="QUIT", fg="red", command=main.destroy).grid(row=2)




    def descStripper(self,desc):
        """Return the text following the first 'alt="' in *desc*.

        The returned text runs from just after 'alt="' up to the next '"'.
        When 'alt="' does not occur, a fixed fallback message is returned.
        """
        x1=int(desc.find('alt="'))
        if x1 != -1:
            # Skip past the 5 characters of 'alt="' before looking for the
            # closing quote.
            x2Start = x1+5
            # If no closing quote exists, find() gives -1 and the slice below
            # stops one character before the end of desc.
            x2=int(desc.find('"',x2Start))
            desc = desc[x1+5:x2]
            return desc
        else:
            desc = "There is no description. Maybe it's a link"
            return desc

    def lbPopulator(self,title,pub,link):
        """Replace the listbox contents with the entries of *title*.

        *pub* and *link* are accepted but not used here.
        """
        # print('Went to lbPopulator()')
        self.thread_lb.delete(0,tk.END)
        for item in title:
            self.thread_lb.insert(tk.END,item)

    def getXmlData(self):
        """Parse 'rss.xml' and refresh the per-thread lists and the listbox.

        Titles, publication dates, links and stripped descriptions of every
        'item' element are collected into the corresponding ``self.thread*``
        lists, which are reset on each call.
        """
        # NOTE(review): counter and self.threadThumbNail are set here but not
        # used anywhere else in this method.
        counter = 0
        self.threadPubDateList = []
        self.threadTitleList = []
        self.threadLinkList = []
        self.threadDescList = []
        self.threadThumbNail = []
        tree=ET.parse('rss.xml')
        root=tree.getroot()
        # Walk two levels below the root and look at the children of every
        # element tagged 'item'.
        for channel in root:
            for SubChannel in channel:
                if SubChannel.tag == 'item':
                    for threadInfo in SubChannel:
                        # print(SubChannel.getchildren())
                        if threadInfo.tag == 'title':
                            self.threadTitleList.append(threadInfo.text)
                        if threadInfo.tag == 'pubDate':
                            # Drops the last 6 characters of the date text;
                            # presumably a timezone suffix - TODO confirm.
                            self.threadPubDateList.append(threadInfo.text[:-6])
                        if threadInfo.tag == 'link':
                            self.threadLinkList.append(threadInfo.text)
                        if threadInfo.tag == 'description':
                            self.threadDescList.append(self.descStripper(threadInfo.text))









                        # NOTE(review): this matches the namespaced 'title'
                        # tag, which is reported to print an empty attrib dict
                        # and None for 'url'; the 'url' attribute is reported
                        # to be on the 'thumbnail' tag of the same namespace.
                        if threadInfo.tag == '{http://search.yahoo.com/mrss/}title':
                            print(threadInfo.tag)
                            print(threadInfo.attrib)
                            print(threadInfo.get('url'))











        self.lbPopulator(self.threadTitleList,self.threadPubDateList,self.threadLinkList)
        # print(self.threadTitleList)
        # print(self.threadPubDateList)
        # print(self.threadLinkList)
        # print(self.threadDescList)
    def getXML(self,subreddit):
        """Download the feed for *subreddit* to 'rss.xml' and re-parse it.

        An HTTPError is caught and reported with a printed message only.
        """
        try:
            # NOTE(review): no '/' is inserted between the host and
            # subreddit, so the entry text presumably has to start with one
            # (e.g. '/r/blender') - verify against how it is used.
            url = 'http://www.reddit.com'+subreddit+'.rss'
            source = urllib.request.urlretrieve(url,'rss.xml')
            self.getXmlData()
        except urllib.error.HTTPError as err:
            print('Too many requests-Try again')
    def initial(self):
        """Download the front-page feed to 'rss.xml' and parse it.

        On HTTPError a message is printed, the call sleeps for 3 seconds and
        then ``self.__init__()`` is invoked again.
        """
        try:
            source = urllib.request.urlretrieve('http://www.reddit.com/.rss','rss.xml')
            self.getXmlData()
        except urllib.error.HTTPError as err:
            print('Too many requests-Trying again 3')
            time.sleep(3)
            # NOTE(review): this re-runs pack(), createWidgets() and
            # initial() on the same instance (with master=None) rather than
            # only retrying the download - confirm this is intended.
            self.__init__()


# main.geometry("250x150")

# Build the frame inside the root window (this also triggers the first feed
# download via Application.__init__) and enter the Tk event loop.
app = Application(master=main)
app.mainloop()

Here is the bit of the code that should return the URL of the thumbnail when passed an XML file. The problem is in the last 'if' statement; the rest all works fine.

def getXmlData(self):
    """Parse 'rss.xml' and collect per-item data into the ``self.thread*`` lists.

    Titles, publication dates, links and stripped descriptions of every
    'item' element are appended to their lists, which are reset on each call.
    """
    # NOTE(review): counter and self.threadThumbNail are set here but not
    # used anywhere else in this excerpt.
    counter = 0
    self.threadPubDateList = []
    self.threadTitleList = []
    self.threadLinkList = []
    self.threadDescList = []
    self.threadThumbNail = []
    tree=ET.parse('rss.xml')
    root=tree.getroot()
    # Walk two levels below the root and look at the children of every
    # element tagged 'item'.
    for channel in root:
        for SubChannel in channel:
            if SubChannel.tag == 'item':
                for threadInfo in SubChannel:
                    # print(SubChannel.getchildren())
                    if threadInfo.tag == 'title':
                        self.threadTitleList.append(threadInfo.text)
                    if threadInfo.tag == 'pubDate':
                        # Drops the last 6 characters of the date text;
                        # presumably a timezone suffix - TODO confirm.
                        self.threadPubDateList.append(threadInfo.text[:-6])
                    if threadInfo.tag == 'link':
                        self.threadLinkList.append(threadInfo.text)
                    if threadInfo.tag == 'description':
                        self.threadDescList.append(self.descStripper(threadInfo.text))
                    # This is the statement the question is about: it matches
                    # the namespaced 'title' tag and prints its tag, its
                    # attribute dict and its 'url' attribute.
                    if threadInfo.tag == '{http://search.yahoo.com/mrss/}title':
                        print(threadInfo.tag)
                        print(threadInfo.attrib)
                        print(threadInfo.get('url'))

You should try posting a specific bit of your code that exemplifies your issue, or create an illustrative example. It would also be helpful to see a sample input, expected output, and actual output. — msnider
– msnider, Commented Dec 11, 2013 at 19:44
That is better. It makes it easier to see what you tried and what went wrong. It would be also good to see an example of the XML element you are parsing... What is the output of the 3 print statements in the final if block? — msnider
– msnider, Commented Dec 11, 2013 at 20:25
If you just run the program it will download the XML. You can find it at reddit.com/.rss. The output is '{http://search.yahoo.com/mrss/}title' {} None — ddaniels
– ddaniels, Commented Dec 11, 2013 at 21:12
I believe that it should be returning '{http://search.yahoo.com/mrss/}title' 'url':'(thumbnailURL)' (thumbnailURL) — ddaniels
– ddaniels, Commented Dec 11, 2013 at 21:19
At the beginning of the XML file xmlns:media is declared as "{search.yahoo.com/mrss}". Without substituting it, it never finds the element. — ddaniels
– ddaniels, Commented Dec 11, 2013 at 22:47

msnider · Accepted Answer · 2013-12-11 23:00:56Z

1

The only tag that has an attribute called url is the media:thumbnail tag. And as you pointed out, media is declared at the top with xmlns:media="http://search.yahoo.com/mrss/". This leads me to believe that your last if statement should be:

# Match the media:thumbnail element (ElementTree spells the tag with the
# namespace URI in braces); it is the tag that carries the 'url' attribute.
if threadInfo.tag == '{http://search.yahoo.com/mrss/}thumbnail':
   print(threadInfo.tag)
   print(threadInfo.attrib)
   print(threadInfo.get('url'))

Which should produce the output:

'{http://search.yahoo.com/mrss/}thumbnail'
{'url' : 'http://a.thumbs.redditmedia.com/cozEqqG9muj-tT3Z.jpg'}
'http://a.thumbs.redditmedia.com/cozEqqG9muj-tT3Z.jpg'

edited Dec 11, 2013 at 23:00

answered Dec 11, 2013 at 22:17

msnider

444 reputation · 1 gold badge · 4 silver badges · 9 bronze badges

Sign up to request clarification or add additional context in comments.

2 Comments

ddaniels Over a year ago

At the beginning of the xml media is declared as {search.yahoo.com/mrss}. If you change it to the "media:thumbnail" then it never finds the tag.

ddaniels Over a year ago

Yup that works. Didn't remember to change it back to thumbnail. Thank you!

Collectives™ on Stack Overflow

Get retrieve XML attrib using ElementTree Python

1 Answer 1

2 Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

2 Comments

Your Answer

Sign up or log in

Post as a guest

Related