1

I have an HTML saved in a cell on the google sheet. Now I would like to extract element values from it. Can anyone please guide?

Here is the sample HTML that I am working with:

<div class="test"><a href="/this-is-page-url" class="cc_a_a"><div data-react-toolbox="card" class="new_test"><div style="background-image:url(&#x27;https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png&#x27;)" class="new_class" title="this is image"><div class="last"></div></div><div class="new_2"><div class="title_test"><div class="card_title">Title Goes Here</div></div></div><div class="for_text"><p>test goes here</p></div><div class="for_date"><p>Jan 1, 2020</p></div></div></a></div>

I would like to extract:

  • the a element's href value
  • image background url
  • Title
  • Text
  • Date (another text)

Here is the sample code with which I am trying to extract the href value. Unfortunately, I have no idea how to extract the other elements.

// Attempt to collect every <a href> from an HTML string and write it to cell B7 of 'mysheet'.
var variable_for_cell_with_HTML = "MY_HTML_GOES_HERE_FROM_ABOVE";
 var myurl = variable_for_cell_with_HTML;
// NOTE: `document` is a browser global; this is the line that raises the
// "ReferenceError: document is not defined" reported below.
var doc = document.createElement("html");
// NOTE(review): `rawHTML` is never declared in this snippet (the HTML string is held in `myurl`).
doc.innerHTML = rawHTML;
var links = doc.getElementsByTagName("a")
var urls = [];

for (var i=0; i<links.length; i++) {

  // NOTE(review): Array.prototype.push returns the new array length, so setValue
  // receives a count rather than the href, and every iteration overwrites B7.
  SpreadsheetApp.getActive().getSheetByName('mysheet').getRange('B7').setValue(urls.push(links[i].getAttribute("href")));
}

Getting ERROR

ReferenceError: document is not defined

2 Answers 2

2

If you're trying to extract specific HTML elements from a given URL, you can follow this general format:

=importxml(A8,"//div[@class='class of desired div']//h3[@class='class of desired h3 element']")

Where A8 is a cell with the web link to the HTML, and where the div or h3 are the tags encompassing your desired result from the page. This is just one example extracting a specific h3 from a specific div, but you could leave off the [@class='...'] predicates to just return all the h3 elements within the prior div.

I'm sure this could be applied to your specific case as well.

Sign up to request clarification or add additional context in comments.

Comments

0

It's only HTML after it's loaded into the browser. Before that it's just a string. Use standard JavaScript string methods.

Something like this regex will get you close to the href URL:

/href="([^"]{1,})"/g 

This will get you close to the background URL:

https:\/\/[^&]{1,}

Regex Tester

This is the html file for my regex tester. I wrote it a long time ago so it's probably a bit neophyte....ish?

<!DOCTYPE html>
<html>
  <head>
    <base target="_top">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
  </head>
  <script>
  // On DOM ready, fetch the last saved text, pattern and flags from the server
  // (getLastTextPatternFlags) and populate the dialog controls with them.
  // The handler reads the keys text, pattern, g, i and m from the returned object.
  $(function(){
    // True only for a case-insensitive 'yes'. String() guards against a missing
    // key or a non-string cell value, which previously threw on .toLowerCase()
    // and aborted the rest of the initialisation.
    function isYes(value){
      return String(value).toLowerCase()==='yes';
    }
    google.script.run
    .withSuccessHandler(function(rObj){
      $('#text').val(rObj.text);
      $('#pattern').val(rObj.pattern);
      $('#results').css('background','white');
      $('#set_g').prop('checked',isYes(rObj.g));
      $('#set_i').prop('checked',isYes(rObj.i));
      $('#set_m').prop('checked',isYes(rObj.m));
    })
    .getLastTextPatternFlags();
  });

  // Runs the regular expression from #pattern (flags from getFlags()) against
  // the contents of #text and shows the outcome in #results.
  //
  // #results is highlighted yellow while working and reset to white when done.
  // Output is one "result[i]= ..." line per entry returned by String.match,
  // "No Results" when nothing matched, or "Check Error in Console Log" when the
  // pattern cannot be compiled.
  function findData(){
    var $results=$('#results');
    $results.css('background','yellow');
    $results.val('');
    var text=$('#text').val();
    var pattern=$('#pattern').val();
    var flags=getFlags();
    var regex;
    try{
      regex=new RegExp(pattern,flags);
    }
    catch(e){
      console.error(e);
      $results.css('background','white');
      $results.val('Check Error in Console Log');
      // Stop here: continuing would run match() with an undefined regex and
      // overwrite the error message just shown.
      return;
    }
    var result=text.match(regex);
    var lines=[];
    if(result){
      for(var i=0;i<result.length;i++){
        lines.push('result[' + i + ']= ' + result[i]);
      }
    }
    var rsltLog=lines.join('\n');
    // match() returns null when nothing matched, so guard the length lookup.
    var count=result?result.length:0;
    console.log('module: %s pattern: %s regex: %s flags: %s result: %s length: %s','findData()',pattern,regex,flags,rsltLog,count);
    $results.val(result?rsltLog:'No Results');
    $results.css('background','white');
  }

    // Builds the RegExp flag string from the g / i / m checkboxes.
    // Returns the checked flags concatenated in the order g, i, m ('' if none).
    function getFlags(){
      var boxes=[
        ['#set_g','g'],
        ['#set_i','i'],
        ['#set_m','m']
      ];
      var flags='';
      for(var k=0;k<boxes.length;k++){
        if($(boxes[k][0]).is(':checked')){
          flags+=boxes[k][1];
        }
      }
      return flags;
    }

    // Sends the contents of #text to the server-side saveText function.
    // The textarea is highlighted yellow until the server call succeeds.
    function saveText(){
      var selector='#text';
      $(selector).css('background','yellow');
      var txt=$(selector).val();
      var clearHighlight=function(){
        $(selector).css('background','white');
      };
      google.script.run.withSuccessHandler(clearHighlight).saveText(txt);
    }

      // Sends the contents of #pattern to the server-side savePattern function.
      // The textarea is highlighted yellow until the server call succeeds.
      function savePattern(){
        var selector='#pattern';
        $(selector).css('background','yellow');
        var txt=$(selector).val();
        var clearHighlight=function(){
          $(selector).css('background','white');
        };
        google.script.run.withSuccessHandler(clearHighlight).savePattern(txt);
      }

      // Sends the state of the g / i / m checkboxes to the server-side saveFlags
      // function as {g:'yes'|'no', i:'yes'|'no', m:'yes'|'no'}.
      // #results is highlighted yellow until the server call succeeds.
      function saveFlags(){
        // Maps a checkbox's checked state to the 'yes' / 'no' strings sent to the server.
        var yesNo=function(selector){
          return $(selector).is(':checked') ? 'yes' : 'no';
        };
        $('#results').css('background','yellow');
        var flagObj={
          g:yesNo('#set_g'),
          i:yesNo('#set_i'),
          m:yesNo('#set_m')
        };
        var clearHighlight=function(){
          $('#results').css('background','white');
        };
        google.script.run.withSuccessHandler(clearHighlight).saveFlags(flagObj);
      }
      // Top-level statement: logs once when this script block is evaluated.
      console.log('My Code');
    </script>
    <style>
    .btns{margin:2px 2px 2px 0;}
    #container{width:100%;}
    </style>
  <body>
    <div id='container'>
    TEXT&nbsp;&nbsp;<input class="btns" type="button" value="Save Text" onClick="saveText();" />
    <br /><textarea id="text" placeholder="Enter the text to be searched" rows="4" cols="60"></textarea>
    <br />PATTERN&nbsp;&nbsp;<input class="btns" type="button" value="Save Pattern" onClick="savePattern();" />
    <br /><textarea id="pattern" placeholder="Enter the regex search expression" rows="4" cols="60"></textarea>
    <br />RESULTS
    <br /><textarea id="results" rows="4" cols="60"></textarea>
    <br /><input type="button" value="Search" onClick="findData();" />&nbsp;&nbsp;<input class="hostcontrol" type="button" value="Close" onClick="google.script.host.close();" />
    &nbsp;&nbsp;g&nbsp;&nbsp;<input id="set_g" type="checkbox" />
    &nbsp;&nbsp;i&nbsp;&nbsp;<input id="set_i" type="checkbox" />
    &nbsp;&nbsp;m&nbsp;&nbsp;<input id="set_m" type="checkbox" />
    &nbsp;&nbsp;<input type="button" value="Save Flags" onClick="saveFlags();" />
    &nbsp;&nbsp;<p>Don't leave extra carriage returns in search pattern textbox.</p>
    </div>
  </body>
</html>

And this is the GS code for it:

/**
 * Adds a 'My Tools' menu to the spreadsheet UI with a single 'Regex Tool'
 * item that invokes showRegexDialog.
 */
function onOpen(){
  var ui=SpreadsheetApp.getUi();
  var menu=ui.createMenu('My Tools');
  var withItem=menu.addItem('Regex Tool', 'showRegexDialog');
  withItem.addToUi();
}

/**
 * Opens the 'RegexTester' HTML file as an 800x500 modeless dialog titled
 * 'Regex Tester'.
 */
function showRegexDialog(){
  var page=HtmlService.createHtmlOutputFromFile('RegexTester');
  var sized=page.setWidth(800).setHeight(500);
  var title='Regex Tester';
  SpreadsheetApp.getUi().showModelessDialog(sized, title);
}

/**
 * Reads the 'Input' sheet and returns its first two columns as a key/value
 * object: column A supplies the key, column B the value. The object is also
 * written to the Apps Script log.
 *
 * @return {Object} Map of column-A values to column-B values.
 */
function getLastTextPatternFlags(){
  var rows=SpreadsheetApp.getActive()
    .getSheetByName('Input')
    .getDataRange()
    .getValues();
  var saved={};
  rows.forEach(function(row){
    saved[row[0]]=row[1];
  });
  Logger.log(saved);
  return saved;
}

/**
 * Stores the search text in the 'Input' sheet.
 *
 * Scans column A of the sheet's data range for rows keyed 'text' and writes
 * txt into column B of each such row. Only the matching cells are written;
 * rewriting the whole data range with setValues would replace any formulas in
 * the sheet with their computed values.
 *
 * @param {string} txt Text to store.
 * @return {boolean} Always true.
 */
function saveText(txt){
  var sh=SpreadsheetApp.getActive().getSheetByName('Input');
  var vA=sh.getDataRange().getValues();
  for(var i=0;i<vA.length;i++){
    if(vA[i][0]==='text'){
      // The data range starts at A1, so array index i is sheet row i+1; column 2 is B.
      sh.getRange(i+1,2).setValue(txt);
    }
  }
  return true;
}

/**
 * Stores the regex pattern in the 'Input' sheet.
 *
 * Scans column A of the sheet's data range for rows keyed 'pattern' and writes
 * txt into column B of each such row. Only the matching cells are written;
 * rewriting the whole data range with setValues would replace any formulas in
 * the sheet with their computed values.
 *
 * @param {string} txt Pattern source to store.
 * @return {boolean} Always true.
 */
function savePattern(txt){
  var sh=SpreadsheetApp.getActive().getSheetByName('Input');
  var vA=sh.getDataRange().getValues();
  for(var i=0;i<vA.length;i++){
    if(vA[i][0]==='pattern'){
      // The data range starts at A1, so array index i is sheet row i+1; column 2 is B.
      sh.getRange(i+1,2).setValue(txt);
    }
  }
  return true;
}

/**
 * Stores the regex flags in the 'Input' sheet.
 *
 * For every row whose column-A key is g, i or m (compared case-insensitively),
 * writes the matching flagObj entry into column B. Only those cells are
 * written; rewriting the whole data range with setValues would replace any
 * formulas in the sheet with their computed values.
 *
 * @param {Object} flagObj Object with keys g, i and m holding the values to store.
 * @return {boolean} Always true.
 */
function saveFlags(flagObj){
  var sh=SpreadsheetApp.getActive().getSheetByName('Input');
  var vA=sh.getDataRange().getValues();
  for(var i=0;i<vA.length;i++){
    var n=String(vA[i][0]).toLowerCase();
    if(n==='g' || n==='i' || n==='m'){
      // The data range starts at A1, so array index i is sheet row i+1; column 2 is B.
      sh.getRange(i+1,2).setValue(flagObj[n]);
    }
  }
  return true;
}


/**
 * Web-app entry point: serves the 'RegexTester' page with an extra style rule
 * that hides elements of class 'hostcontrol', and sets the X-Frame-Options
 * mode to ALLOWALL.
 *
 * @return {HtmlOutput} The page to serve.
 */
function doGet(){
  var hideHostControls='<style>.hostcontrol{display:none;}</style>';
  var page=HtmlService.createHtmlOutputFromFile('RegexTester');
  page.append(hideHostControls);
  var frameMode=HtmlService.XFrameOptionsMode.ALLOWALL;
  return page.setXFrameOptionsMode(frameMode);
}

Pour it into a dialog and play with it..

8 Comments

Hi @Cooper - would you please show me an example? I have posted the code I am trying above. I am open to trying a new approach / code (if my approach is not correct). As I mentioned in my question, I am looking for more than just the href value and I am not sure how to get the values in Google Apps Script.
Just use Javascript string match and the above regular expressions
The code you're using is for JavaScript inside of a browser after the document has loaded. There is no document in a spreadsheet.
Admittedly, my regex definitely needs some adjustment. But I'd be happy with a little regex and some slicing myself.
Hi @Cooper first of all, thank you for sharing the code. Since in my use-case, I do not have a separate HTML file and have HTML code saved in a cell on the google sheet (HTML that I shared). Do you have any suggestion on how to process and how can I extract various element values? Any help with the code would be appreciated.
|

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.