0

I want to create a table that shows the following fields, but I ran into some problems:

    /// <summary>
    /// One book entry scraped from the page: its link, title, description text
    /// and character list.
    /// </summary>
    public class Book
    {
        /// <summary>
        /// Intended to hold the <c>href</c> attribute of the book's title link.
        /// </summary>
        public HtmlAttribute Href { get; set; }

        /// <summary>
        /// Text of the book's <c>&lt;h3&gt;</c> heading.
        /// </summary>
        public string Title { get; set; }

        /// <summary>
        /// Text of the book's <c>&lt;div&gt;</c>; despite the property name,
        /// the parsing code stores the description here.
        /// </summary>
        public string Author { get; set; }

        /// <summary>
        /// Text of the book's character list (<c>&lt;ul&gt;</c>), which not every book has.
        /// </summary>
        public string Characters { get; set; }
    }

This is the page I am trying to parse. I need the href value, the link text, the description, and the character list (which is sometimes missing):

    <div id=title> 
        <li>
            <h3><a href="www.harrypotter.com">Harry Potter</a></h3>
            <div>Harry James Potter is the title character of J. K. Rowling's Harry Potter series. </div>
            <ul>
                <li>Harry Potter</li>
                <li>Hermione Granger</li>
                <li>Ron Weasley</li>
            </ul>
        </li>

        <li>
            <h3><a href="www.littleprince.com">Little Prince</a></h3>
            <div>A little girl lives in a very grown-up world with her mother, who tries to prepare her for it.  </div>
        </li>
    </div>

And this is my code to parse it and put it in a list

    List<Book> BookList = new List<Book>();

    // Select one node per book: the <li> elements that are DIRECT children of
    // the element with id="title". Using "/li" instead of "//li" keeps the
    // nested character <li> items (which live under <ul>) out of the result.
    var bookNodes = doc.DocumentNode.SelectNodes("//*[@id=\"title\"]/li");

    // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
    // when nothing matches, so guard before iterating.
    if (bookNodes != null)
    {
        foreach (var bookNode in bookNodes)
        {
            // All lookups are relative to the current book ("./..."), so the
            // pieces can never drift out of step the way three independent
            // document-wide lists indexed in parallel do when a book has no <ul>.
            var headingNode = bookNode.SelectSingleNode("./h3");
            var linkNode = bookNode.SelectSingleNode("./h3/a");
            var descNode = bookNode.SelectSingleNode("./div");
            var characterNodes = bookNode.SelectNodes("./ul/li");

            var book = new Book();

            // The attribute indexer yields null when the link has no href.
            book.Href = linkNode?.Attributes["href"];
            book.Title = headingNode?.InnerText ?? string.Empty;
            book.Author = descNode?.InnerText ?? string.Empty;

            // Books without a character list get an empty string instead of
            // triggering an exception; otherwise join the individual names.
            book.Characters = characterNodes == null
                ? string.Empty
                : string.Join(", ", characterNodes.Select(node => node.InnerText.Trim()));

            BookList.Add(book);
        }
    }

My questions are: 1) How do I get the href value and add it to the list? 2) Some entries have no character list in the HTML; how can I build the list without getting a NullReferenceException? Note: I can't make any changes to the HTML.

2 Answers 2

0

I have solved your problem without using HtmlAgilityPack; here I am using System.Xml instead.

Note: You should add a unique value to identify the main li elements; here I have added the class 'Main'.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;

namespace Test
{
/// <summary>
/// One parsed book entry. Every property is a plain string that the parser
/// fills in, using an empty string when the corresponding element is absent.
/// </summary>
public class Book
{
    /// <summary>
    /// Value of the <c>href</c> attribute taken from the book's heading link.
    /// </summary>
    public string Href { get; set; }

    /// <summary>
    /// Text of the book's <c>&lt;h3&gt;</c> heading.
    /// </summary>
    public string Title { get; set; }

    /// <summary>
    /// Text of the book's <c>&lt;div&gt;</c>; despite the property name,
    /// the parser stores the description here.
    /// </summary>
    public string Author { get; set; }

    /// <summary>
    /// Text of the book's character list (<c>&lt;ul&gt;</c>).
    /// </summary>
    public string Characters { get; set; }
}

/// <summary>
/// Demo: parses a well-formed HTML fragment with <see cref="XmlDocument"/> and
/// collects one <see cref="Book"/> per top-level <c>&lt;li&gt;</c>.
/// </summary>
class Program
{
    /// <summary>
    /// Loads the embedded sample markup, walks each book entry and builds the
    /// list of books. Missing elements produce empty strings, never exceptions.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        string str="<div id='title'><li class='Main'><h3><a href='www.harrypotter.com'>Harry Potter</a></h3><div>Harry James Potter is the title character of J. K. Rowling's Harry Potter series. </div>";
        str += "<ul><li>Harry Potter</li><li>Hermione Granger</li><li>Ron Weasley</li></ul></li><li class='Main'><h3><a href='www.littleprince.com'>Little Prince</a></h3><div>A little girl lives in a very grown-up world with her mother, who tries to prepare her for it.  </div></li></div>";

        // LoadXml requires well-formed XML; it throws XmlException on markup
        // that is valid HTML but not valid XML (e.g. unquoted attributes).
        XmlDocument doc = new XmlDocument();
        doc.LoadXml(str);

        // Select only the <li> elements that are DIRECT children of #title.
        // The character <li> items sit under <ul>, so they are excluded
        // without needing any marker class on the book entries.
        XmlNodeList xnList = doc.SelectNodes("//*[@id=\"title\"]/li");

        List<Book> BookList = new List<Book>();

        foreach (XmlNode bookNode in xnList)
        {
            XmlNode titleNode = bookNode.SelectSingleNode("h3");
            XmlNode descNode = bookNode.SelectSingleNode("div");

            // Ask XPath for the attribute itself rather than inspecting
            // ChildNodes[0]: a leading text node has a null Attributes
            // collection, which would throw when dereferenced.
            XmlNode hrefNode = titleNode?.SelectSingleNode("a/@href");

            // XmlNode.SelectNodes yields an empty list when a book has no
            // <ul>, so the join below simply produces an empty string.
            List<string> characterNames = new List<string>();
            foreach (XmlNode characterNode in bookNode.SelectNodes("ul/li"))
            {
                characterNames.Add(characterNode.InnerText.Trim());
            }

            Book book = new Book();
            book.Title = titleNode?.InnerText ?? string.Empty;
            book.Author = descNode?.InnerText ?? string.Empty;
            // Join the names explicitly; the <ul>'s InnerText would run them
            // together with no separator.
            book.Characters = string.Join(", ", characterNames);
            book.Href = hrefNode?.Value ?? string.Empty;

            BookList.Add(book);
        }
    }
}
}
Sign up to request clarification or add additional context in comments.

1 Comment

I can't modify the html because it is from the website I am trying to parse.
0

This is how I would do it. Let me know if you have any questions so I can help.

        // Get the book entries: <li> elements whose parent is the element with
        // id="title". Character <li> items have a <ul> parent and are skipped.
        var lis = doc.DocumentNode.Descendants("li").Where(node => node.ParentNode.Id == "title");
        foreach (var li in lis)
        {
            // Title and href both come from the <h3>; look it up once and
            // tolerate entries that have no heading at all.
            var heading = li.Descendants("h3").FirstOrDefault();
            var title = heading?.InnerText ?? string.Empty;

            // The link is a descendant of the <h3>, so search the heading for
            // <a>. Filtering the <h3> nodes themselves by name "a" can never
            // match and would always leave href null.
            // href is null when there is no link or the link has no href.
            var href = heading?.Descendants("a").FirstOrDefault()?.Attributes["href"];

            var desc = li.Descendants("div").FirstOrDefault()?.InnerHtml ?? string.Empty;

            // Not every book has a character list; skip the loop entirely when
            // the <ul> is missing instead of enumerating a null sequence.
            var characterList = li.Descendants("ul").FirstOrDefault();
            if (characterList != null)
            {
                foreach (var character in characterList.Descendants("li"))
                {
                    var val = character.InnerText;
                }
            }
        }

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.