ramajs

web browser automation from your browser console
git clone https://tongong.net/git/ramajs.git

crawler.js (3305B)


// this script crawls through all linked pages on the same domain it started
// on and generates two arrays: one of all internal pages and one of all
// pages referenced on other domains

// run in the console after loading rama.js

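// rama.js API (as used below): rama.new() returns a client c with c.w (the
// controlled window) and c.d (its document); c.waitForReload() resolves once
// the controlled page has finished loading
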
// links pointing to a different domain
externalLinks = [];
// links on the same domain that were already crawled
internalCrawled = [];
// links on the same domain that still have to be crawled
internalNotCrawled = [];

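// internalNotCrawled is used as a FIFO queue (shift from the front, push to
// the back), so pages are visited breadth-first from the start page
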
// returns the url of a location object without the URL fragment
// (https://en.wikipedia.org/wiki/URI_fragment)
// -> no page has to be loaded twice
function urlf(location) {
    return location.href.replace(location.hash, "");
}

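// example (URL objects expose .href and .hash just like location objects):
//   urlf(new URL("https://example.com/page#intro")) -> "https://example.com/page"
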
// async IIFE wrapper needed for await to work
(async () => {
    // initialize the rama.js client
    rama.clearpage();
    c = rama.new();

    await c.waitForReload();
    // seed the queue with the start page (fragment stripped via urlf so later
    // deduplication against queued urls stays consistent)
    internalNotCrawled.push(urlf(c.w.location));

    while (internalNotCrawled.length > 0) {
        // visit the next page
        let newUrl = internalNotCrawled.shift();
        internalCrawled.push(newUrl);
        c.w.location = newUrl;
        console.log(`visiting ${newUrl}`);
        console.log(
            `internal: ${internalCrawled.length} crawled and ` +
                `${internalNotCrawled.length} remaining. ` +
                `external: ${externalLinks.length}`
        );
        await c.waitForReload();

        // get all <a> tags
        links = c.d.qsa("a");

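        // qsa appears to return a real Array (filter is called on it
        // directly); a raw querySelectorAll NodeList would first need
        // Array.from()
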
        // filter out all urls of filetypes that are not websites
        links = links.filter((l) => {
            let parts = l.pathname.split(".");

            // just a normal path like /examplefolder/examplesite
            if (parts.length === 1) return true;
            else {
                // file with extension like /examplefolder/examplefile.txt or a
                // weird path like /example.folder/examplesite or
                // /example/.folder/file.txt
                let extension = parts[parts.length - 1];

                // not actually an extension but a path with a dot in a
                // folder name like /example.folder/examplesite
                if (extension.includes("/")) return true;

                // just a website with extension
                if (
                    ["html", "htm", "php", "asp", "aspx", "shtml"].includes(
                        extension
                    )
                )
                    return true;

                // anything else is treated as a non-page file (images,
                // archives, downloads, ...)
                return false;
            }
        });

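        // note that l.pathname excludes the query string, so a link like
        // /download?file=report.pdf has the extension-less pathname /download
        // and passes the filter
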
        // add the links to the lists if they are not already there
        links.forEach((l) => {
            let u = urlf(l);
            if (l.hostname !== c.w.location.hostname) {
                // external link
                // push to list if not already there
                if (!externalLinks.includes(u)) externalLinks.push(u);
            } else if (
                !internalCrawled.includes(u) &&
                !internalNotCrawled.includes(u)
            ) {
                // internal link that is not already part of the lists
                internalNotCrawled.push(u);
            }
        });
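        // the linear includes() scans make deduplication quadratic over the
        // whole crawl; Sets would be faster for large sites, but plain arrays
        // keep the results easy to inspect in the console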
    }
    console.log("finished crawling");
})();
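
// once "finished crawling" is logged, the results can be inspected from the
// console, e.g.:
//   console.table(internalCrawled);
//   copy(JSON.stringify(externalLinks, null, 2)); // copy() is a DevTools helper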