0

I have been trying to parse a very simple HTML response but without success so far.

The HTTP POST request seems to work as required but then no results.

Public Sub parsehtml()

Dim http As Object, html As New HTMLDocument, vessels As Object, titleElem As Object, detailsElem As Object, vessel As HTMLHtmlElement
Dim i As Integer

Set http = CreateObject("MSXML2.XMLHTTP")
http.Open "POST", "http://www.medmouic.org/Home/Trouver", False
http.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
http.send "imonumber=&val=0&Name=&selectFlag=333&selectType=&date1=01.08.2019&date2=31.08.2019"
html.body.innerHTML = http.responseText
Debug.Print http.responseText
Set vessels = html.getElementsByTagName("tbody")

i = 2
    For Each vessel In vessels
        Set titleElem = vessel.getElementsByClassName("tr")(0)
            Folha1.Cells(i, 1).Value = titleElem.getElementsByTagName("td")(0).innerText
            Folha1.Cells(i, 2).Value = titleElem.getElementsByTagName("span")(0).href
            Folha1.Cells(i, 3).Value = titleElem.getElementsByTagName("span")(0).innerText
            'Sheets(1).Cells(i, 4).Value = detailsElem.getElementsByTagName("a")(0).innerText
    i = i + 1
Next

End Sub

And the HTML looks like this:

<table class="data-table" id="anyTable" >
.......
        <tbody>
            <tr>
                <td class="sub">
                    9301433

                </td>
                <td class="sub">

                   BOMAR CAEN
                </td>
                <td class="sub">
                    Portugal
                </td>
                <td class="sub">
                    Containership
                </td>
                <td class="sub">

                    <span class="label label label-success"  alt="show details " title="show details ">
                        <a href="/Home/AfficherRapport?rapportID=84883" style="color:white; background-color:#58D3F7">30.08.2019</a>
                    </span>
                </td>
                <td class="sub"> Malta</td>
            </tr>   
        </tbody>

The objective is to get the innertext inside all those "SUB" classes.

1
  • Have you tried to see what getElementsByClassName("sub") gave you? Commented Nov 18, 2019 at 17:17

1 Answer 1

2

The HTML contains flaws which is why I think the html parser is falling over.

Ideally, you would have access to the source and fix it. Otherwise, you could use regex to fix it but as table html itself is ok you can regex that out (note I wouldn't normally recommend regex with html but I am using it on a string). I use clipboard to write out table.

Public Sub WriteOutShipInspectionTable()
    Dim http As Object, s As String

    Set http = CreateObject("MSXML2.XMLHTTP")

    With http
        .Open "POST", "http://www.medmouic.org/Home/Trouver", False
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .send "imonumber=&val=0&Name=&selectFlag=333&selectType=&date1=01.08.2019&date2=31.08.2019"
        s = .responseText
    End With

    Dim clipboard As Object, re As Object

    Set re = CreateObject("VBScript.RegExp")
    Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")

    With re
        .Global = True
        .MultiLine = True
        .Pattern = "<table[\s\S]*?<\/table>"
        clipboard.SetText .Execute(s)(0)
    End With

    clipboard.PutInClipboard
    ThisWorkbook.Worksheets("Sheet1").Range("A1").PasteSpecial
End Sub

Regex:

enter image description here


All results:

If you examine the javascript for the page construction, and the ajax calls to update the table, they are designed for working with 10 records. To get all results you can regex out the total number of inspections and issue requests to retrieve all records in batches of 10. There's room for a little more re-factoring in the code below but it is good enough.

Option Explicit

Public Sub WriteOutShipInspectionTable()
    Dim http As Object, s As String, ws As Worksheet, re As Object

    Set http = CreateObject("MSXML2.XMLHTTP")
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set re = CreateObject("VBScript.RegExp")

    Dim html As HTMLDocument, body As String, headers(), startDate As String, endDate As String

    startDate = "01.08.2018"
    endDate = "31.08.2019"
    headers = Array("IMO Number", "Ship Name", "Flag state", "Ship Type", "Date of inspection", "Place of inspection")
    Set html = New MSHTML.HTMLDocument

    With re
        .Global = True
        .MultiLine = True
    End With

    With http
        .Open "POST", "http://www.medmouic.org/Home/Trouver", False
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .send "imonumber=&val=0&Name=&selectFlag=333&selectType=&date1=" & startDate & "&date2=" & endDate
        s = .responseText
        html.body.innerHTML = GetString(re, s, "(<table[\s\S]*?<\/table>)")

        Dim totalInspections As Long, results(), r As Long, offset As Long

        totalInspections = CLng(GetString(re, s, "'anyDiv', '(\d+)'"))

        ReDim results(1 To totalInspections, 1 To UBound(headers) + 1)

        results = PopulateArray(html, r, results)

        For offset = 10 To totalInspections Step 10
            .Open "POST", "http://www.medmouic.org/Home/Trouver", False
            .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
            .send "imonumber=&val=" & CStr(offset) & "&Name=&selectFlag=333&selectType=&date1=" & startDate & "&date2=" & endDate
            s = .responseText
            html.body.innerHTML = GetString(re, s, "(<table[\s\S]*?<\/table>)")
            results = PopulateArray(html, r, results)
        Next
    End With

    With ws
        .Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
        .Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
    End With
End Sub

Public Function PopulateArray(ByVal html As MSHTML.HTMLDocument, ByRef r As Long, ByRef results As Variant) As Variant
    Dim c As Long, tr As MSHTML.HTMLTableRow, td As MSHTML.HTMLTableCell, i As Long

    For i = 1 To html.querySelectorAll("tr").Length - 1
        r = r + 1: c = 1
        For Each td In html.querySelectorAll("tr").Item(i).getElementsByTagName("td")
            results(r, c) = td.innerText
            c = c + 1
        Next
    Next
    PopulateArray = results
End Function

Public Function GetString(ByVal re As Object, ByVal s As String, ByVal p As String) As String
    With re
        .Pattern = p
        GetString = .Execute(s)(0).submatches(0)
    End With
End Function

References (VBE>Tools>References):

  1. Microsoft HTML Object Library

Efficiency:

You might consider switching languages to something more efficient where you have libraries to concatenate result sets and that allow for html session whereby you can re-use the underlying tcp connection (you would have to implement your own COM version for VBA as it is not natively available). Here is a python example. The 'lxml' parser used is more forgiving so you can select the table by its id:

import requests, math
from bs4 import BeautifulSoup as bs
import pandas as pd

data = {'imonumber': '', 'Name': '', 'selectFlag': '333','selectType': ''
        , 'date1': '01.01.2018', 'date2': '31.10.2019','val': 0}

def get_table(s, data):
    soup = bs(data, 'lxml')
    df = pd.read_html(str(soup.select_one('#anyTable')))[0]
    return df

with requests.Session() as s:
    r = s.post('http://www.medmouic.org/Home/Trouver', data=data)
    df = get_table(s, r.content)
    total_records = int(re.search(r"'anyDiv', '(\d+)'",r.text).group(1))

    for page in range(10, total_records + 1, 10):
        data['val'] = page
        r = s.post('http://www.medmouic.org/Home/Trouver', data=data)
        df = pd.concat([df,get_table(s, r.content)])

df = df.reset_index(drop=True)
print(df)
Sign up to request clarification or add additional context in comments.

1 Comment

Comments are not for extended discussion; this conversation has been moved to chat.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.