Building a HTML scrapper

Hello guys,

for my internship I am trying to build html scrapper using the online google script;

function doGet() {
var html = UrlFetchApp.fetch(‘https://www.booking.com/searchresults.html?label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmaEaIAQGYATHCAQNhYm7IAQzYAQPoAQH4AQKSAgF5qAIE;sid=a964e0b1714370a49d84145ad8e4495f;checkin_month=12&checkin_monthday=1&checkin_year=2017&checkout_month=12&checkout_monthday=2&checkout_year=2017&class_interval=1&dest_id=-2601889&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&inac=0&index_postcard=0&label_click=undef&no_rooms=1&offset=0&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=London&ss_all=0&ssb=empty&sshis=0&ssne=London&ssne_untouched=London&/Document_Object_Model’).getContentText.getContentText)();
var doc = XmlService.parse(html);
var html = doc.getRootElement();
var menu = getElementsByClassName(html, ‘g-hidden’)[0];
var output = ‘’;
var linksInMenu = getElementsByTagName(menu, ‘a’);
for(i in linksInMenu) output+= XmlService.getRawFormat().format(linksInMenu[i])+’<br>’;
return HtmlService.createHtmlOutput(output);
}

function getElementsByClassName(element, classToFind) {
var data = [];
var descendants = element.getDescendants();
descendants.push(element);
for(i in descendants) {
var elt = descendants[i].asElement();
if(elt != null) {
var classes = elt.getAttribute(‘class’);
if(classes != null) {
classes = classes.getValue();
if(classes == classToFind) data.push(elt);
else {
classes = classes.split(’ ');
for(j in classes) {
if(classes[j] == classToFind) {
data.push(elt);
break;
}
}
}
}
}
}
return data;
}

function getElementsByTagName(element, tagName) {
var data = [];
var descendants = element.getDescendants();
for(i in descendants) {
var elt = descendants[i].asElement();
if( elt !=null && elt.getName()== tagName) data.push(elt);
}
return data;
}

it keeps giving me an error on line 3; The entity name must immediately follow the ‘&’ in the entity reference. (line 3, file “Code”)

I´m trying to get it to retrieve the class: get-hidden from the url; https://www.booking.com/searchresults.en-gb.html?aid=356980;label=gen173nr-1FCAEoggJCAlhYSDNiBW5vcmVmaEaIAQGYATHCAQNhYm7IAQzYAQHoAQH4AQKSAgF5qAIE;sid=ce05c159b074b144a3f8247d2de2c259;checkin_month=12&checkin_monthday=1&checkin_year=2017&checkout_month=12&checkout_monthday=2&checkout_year=2017&class_interval=1&dest_id=-2601889&dest_type=city&from_sf=1&group_adults=2&group_children=0&label_click=undef&no_rooms=1&offset=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=London&ssb=empty&ssne=London&ssne_untouched=London&

Can somebody help me fix the code?

thanks a million!

1 Like

Though this code is very useful but it is not working on my t20 world cup site.