1

The following code retrieves 'dossier Urls' for substances in Column A by scraping the ECHA website. I'm trying to error handle cases where a substance Url cannot be found.

I can't quite see why the following code fails. I have highlighted the Problematic line with a comment. this is highlighted in debugging as an Object Required error but I can't see where I'm going wrong.

Sub PopulateExposures()
    Dim url, rw As Range
    
    Set rw = Sheets("data").Range("A2:E2") 'first row with inputs
    Do While Application.CountA(rw) > 0
        url = SubstanceUrl(rw.Cells(1).Value, rw.Cells(2).Value) 'get the URL
        rw.Cells(5).Resize(1, 3).Value = ExposureData(url) 'get exposure data (as array) and add to row
        Set rw = rw.Offset(1, 0) 'next substance
    Loop

End Sub

Public Function SubstanceUrl(SubstanceName, CASNumber) As String
    
    Const url = "https://echa.europa.eu/information-on-chemicals/registered-substances?" & _
                "p_p_id=dissregisteredsubstances_WAR_dissregsubsportlet&p_p_lifecycle=1&" & _
                "p_p_state=normal&p_p_mode=view&" & _
                "__dissregisteredsubstances_WAR_dissregsubsportlet_javax.portlet.action=dissRegisteredSubstancesAction"
    
    Dim oHTML, oHttp, MyDict, payload, DictKey, sep
    
    Set oHTML = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    Set MyDict = CreateObject("Scripting.Dictionary")
    
    MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_name") = SubstanceName
    MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_cas-number") = CASNumber
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
    
    payload = ""
    For Each DictKey In MyDict
        payload = payload & sep & DictKey & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey))
        sep = "&"
    Next DictKey
        
    With oHttp
        .Open "POST", url, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send payload
        oHTML.body.innerHTML = .responseText
    End With
    
    'PROBLEMATIC CODE
    If oHTML.querySelector(".details").getAttribute("href") Is Error Then
    
    SubstanceUrl = "-"
    Else
    
     'Sometimes output changes despite same input
    SubstanceUrl = oHTML.querySelector(".details").getAttribute("href")
    
    End If
    
    
    Debug.Print SubstanceUrl
    
    
    
End Function

Function ExposureData(urlToGet)
    
    Dim XMLReq As New MSXML2.XMLHTTP60
    Dim HTMLDoc As HTMLDocument, dds
    Dim Route(1 To 3) As String, Results(1 To 3) As String, c, Info, Data
    
    Route(1) = "sGeneralPopulationHazardViaInhalationRoute"
    Route(2) = "sGeneralPopulationHazardViaDermalRoute"
    Route(3) = "sGeneralPopulationHazardViaOralRoute"
    
    XMLReq.Open "Get", urlToGet & "/7/1", False
    XMLReq.send
     
    If XMLReq.Status <> 200 Then
        Results(1) = "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
    Else
        Set HTMLDoc = New HTMLDocument
        HTMLDoc.body.innerHTML = XMLReq.responseText
        For c = 1 To UBound(Route, 1)
            Set Info = HTMLDoc.getElementById(Route(c))
            If Not Info Is Nothing Then
                Set Info = Info.NextSibling.NextSibling.NextSibling
                Set dds = Info.getElementsByTagName("dd")
                If dds.Length > 1 Then
                    Results(c) = dds(1).innerText
                Else
                    Results(c) = "hazard unknown"
                End If
            Else
                Results(c) = "no info"
            End If
        Next c
    End If
    
    ExposureData = Results
    
End Function

For this code to run values must be present in column A. Acetone and Benzene can be used respectively to test 2 rows. To test out the error handling Enter something made up like Benzenjaj.

I think this is a quick fix. Just can't see it.

Update:

Tetsing on made up substance name: enter image description here

First 2 results are found as normal but the made-up chemical causes the following error:

enter image description here enter image description here

  • Code:

    Sub PopulateExposures() Dim url, rw As Range

     Set rw = Sheets("data").Range("A2:E2") 'first row with inputs
     Do While Application.CountA(rw) > 0
         url = SubstanceUrl(rw.Cells(1).Value, rw.Cells(2).Value) 'get the URL
         rw.Cells(5).Resize(1, 3).Value = ExposureData(url) 'get exposure data (as array) and add to row
         Set rw = rw.Offset(1, 0) 'next substance
     Loop
    

    End Sub

    Public Function SubstanceUrl(SubstanceName, CASNumber) As String

     Const url = "https://echa.europa.eu/information-on-chemicals/registered-substances?" & _
                 "p_p_id=dissregisteredsubstances_WAR_dissregsubsportlet&p_p_lifecycle=1&" & _
                 "p_p_state=normal&p_p_mode=view&" & _
                 "__dissregisteredsubstances_WAR_dissregsubsportlet_javax.portlet.action=dissRegisteredSubstancesAction"
    
     Dim oHTML, oHttp, MyDict, payload, DictKey, sep
    
     Set oHTML = New HTMLDocument
     Set oHttp = CreateObject("MSXML2.XMLHTTP")
     Set MyDict = CreateObject("Scripting.Dictionary")
    
     MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_name") = SubstanceName
     MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_cas-number") = CASNumber
     MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
     MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
    
     payload = ""
     For Each DictKey In MyDict
         payload = payload & sep & DictKey & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey))
         sep = "&"
     Next DictKey
    
     With oHttp
         .Open "POST", url, False
         .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
         .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
         .send payload
         oHTML.body.innerHTML = .responseText
     End With
    
     On Error Resume Next 'ignore error on following line
     SubstanceUrl = oHTML.querySelector(".details").getAttribute("href")
     On Error GoTo 0      'stop ignoring errors
    
     If Len(SubstanceUrl) = 0 Then SubstanceUrl = "<no URL>"
    

    End Function

    Function ExposureData(urlToGet)

     Dim XMLReq As New MSXML2.XMLHTTP60
     Dim HTMLDoc As HTMLDocument, dds
     Dim Route(1 To 3) As String, Results(1 To 3) As String, c, Info, Data
    
     Route(1) = "sGeneralPopulationHazardViaInhalationRoute"
     Route(2) = "sGeneralPopulationHazardViaDermalRoute"
     Route(3) = "sGeneralPopulationHazardViaOralRoute"
    
     XMLReq.Open "Get", urlToGet & "/7/1", False
     XMLReq.send
    
     If XMLReq.Status <> 200 Then
         Results(1) = "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
     Else
         Set HTMLDoc = New HTMLDocument
         HTMLDoc.body.innerHTML = XMLReq.responseText
         For c = 1 To UBound(Route, 1)
             Set Info = HTMLDoc.getElementById(Route(c))
             If Not Info Is Nothing Then
                 Set Info = Info.NextSibling.NextSibling.NextSibling
                 Set dds = Info.getElementsByTagName("dd")
                 If dds.Length > 1 Then
                     Results(c) = dds(1).innerText
                 Else
                     Results(c) = "hazard unknown"
                 End If
             Else
                 Results(c) = "no info"
             End If
         Next c
     End If
    
     ExposureData = Results
    

    End Function

4
  • The Is operator compares 2 objects. The getAttribute method returns a String which is not an Object. Commented Dec 8, 2021 at 14:00
  • How can I set the output of this string to an object to check if the URL exists or not? Commented Dec 8, 2021 at 14:25
  • @BrianMStafford I dont understand how Info in the bottom module is a string but the Is operator appears to work? Commented Dec 8, 2021 at 15:28
  • Info is not a String. It is a Variant containing an Object as evidenced by the line Set Info =. You only need the Set keyword for Objects. Also, it would be better to define Info as the proper type. Something like Dim Info As IHTMLElement. Commented Dec 8, 2021 at 15:46

2 Answers 2

2

You can just ignore any error:


    With oHttp
        .Open "POST", url, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send payload
        oHTML.body.innerHTML = .responseText
    End With
    
    On Error Resume Next 'ignore error on following line
    SubstanceUrl = oHTML.querySelector(".details").getAttribute("href")
    On Error Goto 0      'stop ignoring errors

    If Len(SubstanceUrl) = 0 Then SubstanceUrl = "<no URL>" 
Sign up to request clarification or add additional context in comments.

5 Comments

Hi Tim, this appears to fail if I use a made up substance
Also I would like the cells where a URL isn't found to actually return some character like "-"
"Fail" means what exactly? What does that look like? If the querySelector() fails then it should return "<no URL>"
Please See the update to the question to see images of the problem and the code I have used from your answer.
Before passing the URL to ExposureData you need to check it's not equal to "<no URL>"
1

You can test the .Length of querySelectorAll when looking for the particular registered dossier url. You need to amend your code elsewhere to handle the "-" return url. I prefer Tim's solution in terms of ignoring error however for the first bit.

Option Explicit

Sub PopulateExposures()
    Dim url, rw As Range
    
    Set rw = Sheets("data").Range("A2:E2")       'first row with inputs
    Do While Application.CountA(rw) > 0
        url = SubstanceUrl(rw.Cells(1).Value, rw.Cells(2).Value) 'get the URL
        If Left$(url, 5) = "https" Then
            rw.Cells(5).Resize(1, 3).Value = ExposureData(url) 'get exposure data (as array) and add to row
        Else
            rw.Cells(5).Resize(1, 3).Value = url
        End If
        Set rw = rw.Offset(1, 0)                 'next substance
    Loop

End Sub

Public Function SubstanceUrl(SubstanceName, CASNumber) As String
    
    Const url = "https://echa.europa.eu/information-on-chemicals/registered-substances?" & _
    "p_p_id=dissregisteredsubstances_WAR_dissregsubsportlet&p_p_lifecycle=1&" & _
    "p_p_state=normal&p_p_mode=view&" & _
    "__dissregisteredsubstances_WAR_dissregsubsportlet_javax.portlet.action=dissRegisteredSubstancesAction"
    
    Dim oHTML, oHttp, MyDict, payload, DictKey, sep
    
    Set oHTML = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    Set MyDict = CreateObject("Scripting.Dictionary")
    
    MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_name") = SubstanceName
    MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_cas-number") = CASNumber
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
    
    payload = ""
    For Each DictKey In MyDict
        payload = payload & sep & DictKey & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey))
        sep = "&"
    Next DictKey
        
    With oHttp
        .Open "POST", url, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send payload
        oHTML.body.innerHTML = .responseText
    End With
    
    If oHTML.querySelectorAll("[href*=registered-dossier]").Length = 0 Then
    
        SubstanceUrl = "-"
        
    Else
    
        'Sometimes output changes despite same input
        SubstanceUrl = oHTML.querySelector(".details")
    
    End If
    
    
    Debug.Print SubstanceUrl
      
End Function

Function ExposureData(urlToGet)
    
    Dim XMLReq As New MSXML2.XMLHTTP60
    Dim HTMLDoc As HTMLDocument, dds
    Dim Route(1 To 3) As String, Results(1 To 3) As String, c, Info, Data
    
    Route(1) = "sGeneralPopulationHazardViaInhalationRoute"
    Route(2) = "sGeneralPopulationHazardViaDermalRoute"
    Route(3) = "sGeneralPopulationHazardViaOralRoute"
    
    XMLReq.Open "Get", urlToGet & "/7/1", False
    XMLReq.send
     
    If XMLReq.Status <> 200 Then
        Results(1) = "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
    Else
        Set HTMLDoc = New HTMLDocument
        HTMLDoc.body.innerHTML = XMLReq.responseText
        For c = 1 To UBound(Route, 1)
            Set Info = HTMLDoc.getElementById(Route(c))
            If Not Info Is Nothing Then
                Set Info = Info.NextSibling.NextSibling.NextSibling
                Set dds = Info.getElementsByTagName("dd")
                If dds.Length > 1 Then
                    Results(c) = dds(1).innerText
                Else
                    Results(c) = "hazard unknown"
                End If
            Else
                Results(c) = "no info"
            End If
        Next c
    End If
    
    ExposureData = Results
    
End Function

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.