Web scraping with Node.js, jQuery and MongoDB

After the discontinuation of Kimono, I spent last week looking for an easy way to parse HTML on a Node.js server. After trying out a few libraries, I decided to use Cheerio, a powerful library based on jQuery, for its power and simplicity of use.

The following code is a tiny Node app that scrapes public-domain quotes from Goodreads to demonstrate how to use it:

var request = require('request');
var cheerio = require('cheerio');
var Entities = require('html-entities').AllHtmlEntities;

var db = require('./db'); //get the mongoose database
var AuthorModel = require('./model/author'); //get the list of authors to scrape
var QuoteModel = require('./model/quote'); //create each quote in the database using the model


/*** PARSER FUNCTIONS ***/

// Shared HTML-entity decoder used by the parser functions below.
// Declared with `var` so it is not an implicit global (which throws in strict mode).
var entities = new Entities();

/**
 * Load every author through AuthorModel.findAll, fetch all of their quote
 * pages with getQuotesForUrl and store each quote as a QuoteModel document.
 *
 * Nothing is returned; progress and failures are reported on the console.
 */
function getAllQuotesFromAuthors() {

    AuthorModel.findAll(function(error, authors){

        // A failed lookup used to crash on `authors.forEach`; report it instead.
        if (error || !authors) {
            console.log('Impossible to load the authors: ' + (error || 'no result'));
            return;
        }

        authors.forEach(function(author, position){
            var authorName = author.name;
            var authorCat = author.category;
            // NOTE(review): the Author schema shown in this file declares `imageUrl`,
            // not `imageName` - confirm which field is intended here.
            var authorImg = author.imageName;

            getQuotesForUrl(author.quotesBaseUrl, author.pages, function(quotes){

                //save the quotes in the database
                quotes.forEach(function(quote){
                    var entry = new QuoteModel();
                    entry.author = authorName;
                    entry.text = quote.text;
                    entry.category = authorCat;
                    entry.tags = quote.tags;
                    entry.bgImageName = authorImg;
                    entry.version = 0;

                    // When save() hands back a promise, attach a handler so one
                    // failed insert is logged instead of becoming an unhandled rejection.
                    var pendingSave = entry.save();
                    if (pendingSave && typeof pendingSave.catch === 'function') {
                        pendingSave.catch(function(saveError){
                            console.log('Impossible to save quote for ' + authorName + ': ' + saveError);
                        });
                    }
                });

                console.log('Count: '+position+', Name: ' +authorName + ', Quotes lengths: ' + quotes.length);
                console.log('----------------------------------');

            });
        });

    });
}
 
/**
 * Fetch pages 1..pages of an author's quotes and hand the combined list to
 * `callback` exactly once, ordered by page number.
 *
 * @param {string} url - Base URL; "?page=N" is appended for each page.
 * @param {number} pages - Number of pages to fetch.
 * @param {function(Array)} callback - Receives the quotes of all pages; called
 *   with an empty array when there is no page to fetch.
 */
function getQuotesForUrl(url,pages,callback){
  var pageCount = Math.floor(Number(pages));

  // With no page to fetch no request is started, so answer right away
  // instead of leaving the caller waiting forever.
  if (!isFinite(pageCount) || pageCount < 1) {
    callback([]);
    return;
  }

  var quotesByPage = new Array(pageCount);
  // Counted up front so an early (even synchronous) answer for the first page
  // cannot bring the counter to zero before the other requests are started.
  var pending = pageCount;

  function fetchPage(page) {
    extractDataFrom(url+"?page="+page, function(quotes){
      // Stored by page index so the result order does not depend on which
      // request happens to finish first.
      quotesByPage[page - 1] = quotes || [];
      if (--pending === 0) {
        callback([].concat.apply([], quotesByPage));
      }
    });
  }

  for (var page = 1; page <= pageCount; page++) {
    fetchPage(page);
  }
}

/**
 * Download one page and pass the quotes parsed from it to `callback`.
 *
 * The callback is always invoked: a failed request or a non-200 response is
 * logged and reported as an empty list, so a caller counting pending pages
 * can always complete.
 *
 * @param {string} url - Page to download.
 * @param {function(Array)} callback - Receives the quotes returned by parseHtml.
 */
function extractDataFrom(url,callback){
  request(url, function (error, response, html) {
    if (error) {
      console.log('Impossible to download ' + url + ': ' + (error.message || error));
      callback([]);
      return;
    }

    if (response.statusCode !== 200) {
      console.log('Unexpected status ' + response.statusCode + ' for ' + url);
      callback([]);
      return;
    }

    callback(parseHtml(html));
  });
}

/**
 * Parse the html of one quotes page with cheerio and extract its quotes.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {Array<{text: string, tags: string[]}>} One entry per `div.quote`
 *   block; `text` stays "" when the quote could not be isolated.
 */
function parseHtml(html){
  var quotes = [];
  var $ = cheerio.load(html);

  $('div.quote').each(function(i, quoteBlock){
    var text = "";
    var tags = [];

    $(quoteBlock).find('.quoteText').each(function(j, quoteText){
      var cleaned = extractQuoteText($(quoteText).html() || "");
      // Keep the previous value when this element could not be parsed.
      if (cleaned !== null) {
        text = cleaned;
      }
    });

    $(quoteBlock).find('.quoteFooter .left a').each(function(j, quoteTag){
      tags.push($(quoteTag).text());
    });

    quotes.push({
      text:text,
      tags:tags
    });
  });

  return quotes;
}

/**
 * Isolate the quote from the inner html of a `.quoteText` element.
 *
 * @param {string} rawText - Inner html of the element.
 * @returns {?string} The decoded quote, or null (after logging) when the
 *   expected markers are missing.
 */
function extractQuoteText(rawText){
  // NOTE(review): this relies on the html containing the curly quotes and the
  // dash as hex entities (&#x201C; &#x201D; &#x2015;) - confirm for the
  // cheerio version in use.
  var parts = rawText.split("<br>  &#x2015;");
  if (parts.length < 2)
  {
    console.log(parts);
    console.log('Impossible to find divisor "<br>  &#x2015;" in html: ');
    console.log(rawText);
    console.log('----------------------------------');
    return null;
  }

  // Put the whole quote on a single line: `.` in the pattern below does not
  // match line breaks.
  var tmpText = parts[0].trim();
  tmpText = tmpText.replace(/<br\/>/g,"<br>");
  tmpText = tmpText.replace(/\r?\n/g,"<br>");

  var found = tmpText.match(/^.*&#x201C;(.*)&#x201D;.*$/i);
  if (found === null || found.length < 2)
  {
    console.log('Impossible to find quote in html: ');
    console.log(tmpText);
    console.log('----------------------------------');
    return null;
  }

  tmpText = found[1].replace(/<br>/g,"\n");
  // Strip any remaining tag. The previous pattern (/<[^<>]+\?>/) only matched
  // tags ending in "?>", so markup such as <i>…</i> stayed in the quote.
  tmpText = tmpText.replace(/<[^<>]+>/g,"");
  return entities.decode(tmpText);
}

And the Author Model in Mongoose:

var db = require('../db');
var Schema = db.Schema;

// One document per author to scrape.
var AuthorSchema = new Schema({
  // Author name
  name: { type: String },
  // Category of the author
  category: { type: String },
  // First page of quotes, for ex: https://www.goodreads.com/author/quotes/2167493.Gautama_Buddha
  quotesBaseUrl: { type: String },
  // Profile picture url
  imageUrl: { type: String },
  // Number of pages to scrape
  pages: { type: Number }
});

var AuthorModel = db.model('Author', AuthorSchema);