CasperJS scraping assistance required -


I am trying to get the 'title' and 'author' of each thesis from each link. So far, I have this (the issues I need help with are described in the comments inside the code):

  // CasperJS script: collect the thesis links from the browse page, then
  // visit every link and record the thesis title and author.
  var utils = require('utils');
  var casper = require('casper').create({
    verbose: true,
    logLevel: 'error',
    pageSettings: {
      loadImages: false,
      loadPlugins: false,
      userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
    },
    clientScripts: ['lib/jquery.min.js']
  });

  var i = 0;            // index of the next entry of `links` to process
  var links = [];       // thesis URLs, filled in by the casper.start callback
  var thesis_data = []; // collected {title, author} records

  /**
   * Collects the href attribute of every matched link element.
   * Passed to casper.evaluate below, so it uses the page's `document`.
   *
   * @returns {Array} href attribute values of the matched elements
   */
  function getThesisLinks() {
    var links = document.querySelectorAll(''); // Not sure what should go in ('')
    return [].map.call(links, function (link) {
      return link.getAttribute('href');
    });
  }

  /**
   * Processes links[i], advances i and re-schedules itself; once every link
   * has been handled it dumps thesis_data and exits.
   * Must be invoked with `this` bound to the casper instance.
   */
  function loopThroughThesisLinks() {
    // Recurses until all links are processed
    if (i < links.length) {
      this.echo('[LINK #' + i + '] ' + links[i]);
      getThesisData.call(this, links[i]);
      i++;
      this.run(loopThroughThesisLinks);
    } else {
      utils.dump(thesis_data);
      this.exit();
    }
  }

  /**
   * Opens one thesis page and appends its title and author to thesis_data.
   * Must be invoked with `this` bound to the casper instance.
   *
   * @param {string} link - URL of the thesis page
   */
  function getThesisData(link) {
    this.start(link, function () {
      // Get the title of the thesis - not sure what element to insert for this.fetchText
      var title = this.fetchText('');

      // Get the name of the authors - not sure what element to insert for this.fetchText
      var author = this.fetchText('');

      // Add the title & author data to the thesis_data array
      var data = {
        title: title,
        author: author
      };
      thesis_data.push(data);
    });
  }

  casper.start('http://ses.library.usyd.edu.au/handle/2123/345/browse?type=dateissued&sort_by=2&order=DESC&rpp=1495&etal=0&submit_browse=Update', function () {
    links = this.evaluate(getThesisLinks);

    // Convert relative links to absolute URLs
    for (var i = 0; i < links.length; i++) {
      links[i] = "http://ses.library.usyd.edu.au/handle/" + links[i];
    }

    utils.dump(links);
  });

  casper.run(loopThroughThesisLinks);

Any help will be appreciated.

This is a simple CSS selector for all links:

  // Select the anchor inside the third cell of every row of the results table.
  var links = document.querySelectorAll('table.misctable > tbody > tr > td:nth-of-type(3) > a');

You can also use XPath like this:

  var x = require('casper').selectXPath; // goes to the beginning of the file
  var title = this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]'));

I think you can figure out the authors query from this. I would probably have done the crawling differently, using casper.thenOpen in a loop, because the code is rather hard to read with the additional start and run calls being in different functions.

With casper.thenOpen it would look like this:

  var x = require('casper').selectXPath; // goes to the beginning of the file

  /**
   * Processes links[i], advances i and schedules itself as the next step;
   * once every link has been handled it dumps thesis_data and exits.
   * Must be invoked with `this` bound to the casper instance.
   */
  function loopThroughThesisLinks() {
    // Recurses until all links are processed
    if (i < links.length) {
      this.echo('[LINK #' + i + '] ' + links[i]);
      getThesisData.call(this, links[i]);
      i++;
      this.then(loopThroughThesisLinks);
    } else {
      utils.dump(thesis_data);
      this.exit();
    }
  }

  /**
   * Opens one thesis page and appends its title and author to thesis_data.
   * Must be invoked with `this` bound to the casper instance.
   *
   * @param {string} link - URL of the thesis page
   */
  function getThesisData(link) {
    this.thenOpen(link, function () {
      // Each XPath finds the row whose first cell contains the label and
      // reads the second cell of that same row.
      var title = this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]'));
      var author = this.fetchText(x('//table//tr/td[1][contains(text(),"Authors:")]/../td[2]'));

      // Add the title & author data to the thesis_data array
      var data = {
        title: title,
        author: author
      };
      thesis_data.push(data);
    });
  }

Comments

Popular posts from this blog

Java - Error: no suitable method found for add(int, java.lang.String) -

java - JPA TypedQuery: Parameter value element did not match expected type -

c++ - static template member variable has internal linkage but is not defined -