crawler.js (3305B)
// This script crawls through all connected pages on the same domain as it
// started and generates one array of all the pages and one of all referenced
// pages on other domains.
//
// Run in the console after loading rama.js.

// links pointing to a different domain
const externalLinks = [];
// links on the same domain, which were already crawled
const internalCrawled = [];
// links on the same domain, which still have to be crawled
const internalNotCrawled = [];

// extensions that identify a regular, crawlable web page
const PAGE_EXTENSIONS = ["html", "htm", "php", "asp", "aspx", "shtml"];

/**
 * Returns the URL of a location-like object without the URL fragment
 * (https://en.wikipedia.org/wiki/URI_fragment), so no page has to be
 * loaded twice under different fragments.
 *
 * Fix: split("#")[0] instead of href.replace(location.hash, "") — the
 * old form left a bare trailing "#" in place (location.hash is "" then,
 * and replace("", "") is a no-op), producing duplicate crawl entries.
 *
 * @param {Location|HTMLAnchorElement} location - anything with an href
 * @returns {string} the href with everything from the first "#" removed
 */
function urlf(location) {
  return location.href.split("#")[0];
}

/**
 * Decides whether an anchor's pathname looks like a crawlable web page
 * rather than a downloadable file (.txt, .pdf, ...).
 *
 * @param {HTMLAnchorElement} link
 * @returns {boolean} true if the link should be crawled/recorded
 */
function isProbablyPage(link) {
  const parts = link.pathname.split(".");

  // just a normal path like /examplefolder/examplesite
  if (parts.length === 1) return true;

  const extension = parts[parts.length - 1];

  // not actually an extension but some weird path like
  // /example.folder/examplesite
  if (extension.includes("/")) return true;

  // a website with an explicit page extension
  // (fix: lowercase first so ".HTML" etc. is not rejected)
  if (PAGE_EXTENSIONS.includes(extension.toLowerCase())) return true;

  // it has to be an evil file
  return false;
}

// Anonymous async function wrapper needed for await to work.
(async () => {
  // initialize the rama.js client
  rama.clearpage();
  const c = rama.new();

  await c.waitForReload();
  internalNotCrawled.push(c.w.location.href);

  while (internalNotCrawled.length > 0) {
    // visit the next page
    const newUrl = internalNotCrawled.shift();
    internalCrawled.push(newUrl);
    c.w.location = newUrl;
    console.log(`visiting ${newUrl}`);
    console.log(
      `internal: ${internalCrawled.length} crawled and ` +
        `${internalNotCrawled.length} remaining. \nexternal: ` +
        `${externalLinks.length}`
    );
    await c.waitForReload();

    // get all <a> tags, then:
    // 1. keep only http(s) anchors — fix: mailto:/javascript:/tel: links
    //    have an empty or bogus hostname and used to be misfiled into
    //    externalLinks by the hostname comparison below
    // 2. filter out all urls of filetypes that are not websites
    const links = c.d
      .qsa("a")
      .filter((l) => l.protocol === "http:" || l.protocol === "https:")
      .filter(isProbablyPage);

    // add the links to the lists if they are not already there
    links.forEach((l) => {
      const url = urlf(l);
      if (l.hostname !== c.w.location.hostname) {
        // external link: push to list if not already there
        if (!externalLinks.includes(url)) externalLinks.push(url);
      } else if (
        !internalCrawled.includes(url) &&
        !internalNotCrawled.includes(url)
      ) {
        // internal link, which is not already part of the lists
        internalNotCrawled.push(url);
      }
    });
  }
  console.log("finished crawling");
})();