ramajs

web browser automation from your browser console
git clone https://tongong.net/git/ramajs.git

commit 12164bd41ac73b639e27b0f550fa61d9cac46e94
parent e5bf2bc9267c1edf5f2183406bd16a583e3ba3ba
Author: tongong <tongong@gmx.net>
Date:   Tue,  1 Dec 2020 17:23:04 +0100

added an example script

Diffstat:
M README.md           |  4 ++++
A examples/crawler.js | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 101 insertions(+), 0 deletions(-)

diff --git a/README.md b/README.md
@@ -185,6 +185,10 @@ NOTICE: inline styles given to specific elements stay!
 rama.clearcss();
 ```
 
+## examples
+
+See examples folder
+
 ## other projects
 
 I think, at the moment this is the only project with this main idea. But one can
diff --git a/examples/crawler.js b/examples/crawler.js
@@ -0,0 +1,97 @@
+// this script crawls through all connected pages on the same domain as it
+// started and generates one array of all the pages and one of all referenced
+// pages on other domains
+
+// run in the console after loading rama.js
+
+// links pointing to a different domain
+externalLinks = [];
+// links on the same domain, which were already crawled
+internalCrawled = [];
+// links on the same domain, which have to be crawled
+internalNotCrawled = [];
+
+// returns the url of a location object without the URL fragment
+// (https://en.wikipedia.org/wiki/URI_fragment)
+// -> no page has to be loaded twice
+function urlf(location) {
+    return location.href.replace(location.hash, "");
+}
+
+// Anonymous function wrapper needed for await to work
+(async () => {
+    // initialize the rama.js client
+    rama.clearpage();
+    c = rama.new();
+
+    await c.waitForReload();
+    internalNotCrawled.push(c.w.location.href);
+
+    while (internalNotCrawled.length > 0) {
+        // visit the next page
+        let newUrl = internalNotCrawled.shift();
+        internalCrawled.push(newUrl);
+        c.w.location = newUrl;
+        console.log("visiting " + newUrl);
+        console.log(
+            "internal: " +
+                internalCrawled.length +
+                " crawled and " +
+                internalNotCrawled.length +
+                " remaining. external: " +
+                externalLinks.length
+        );
+        await c.waitForReload();
+
+        // get all <a> tags
+        links = c.d.qsa("a");
+
+        // filter out all urls of filetypes that are not websites
+        links = links.filter((l) => {
+            let parts = l.pathname.split(".");
+
+            // just a normal path like /examplefolder/examplesite
+            if (parts.length == 1) return true;
+            else {
+                // file with extension like /examplefolder/examplefile.txt or a
+                // weird path like /example.folder/examplesite or
+                // /example/.folder/file.txt
+                let extension = parts[parts.length - 1];
+
+                // it is actually no extension but some weird path like
+                // /example.folder/examplesite
+                if (extension.includes("/")) return true;
+
+                // just a website with extension
+                if (
+                    ["html", "htm", "php", "asp", "aspx", "shtml"].includes(
+                        extension
+                    )
+                )
+                    return true;
+
+                // it has to be an evil file
+                return false;
+            }
+        });
+
+        // add the links to the lists if they are not already there
+        links.forEach((l) => {
+            if (l.hostname != c.w.location.hostname) {
+                // external link
+                // push to list if not already there
+                if (!externalLinks.includes(urlf(l)))
+                    externalLinks.push(urlf(l));
+            } else {
+                if (
+                    !internalCrawled.includes(urlf(l)) &&
+                    !internalNotCrawled.includes(urlf(l))
+                ) {
+                    // internal link, which is not already part of the lists
+                    internalNotCrawled.push(urlf(l));
+                }
+            }
+        });
+    }
+    console.log("finished crawling");
+})();
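
For context on urlf(): anchor elements expose the same href/hash properties as window.location, so two links that differ only in their fragment collapse to a single crawl entry. A quick illustration, assuming the script above has been pasted so urlf() is defined (the URL is made up):

```
let a = document.createElement("a");
a.href = "https://example.com/page#section-2";
console.log(urlf(a)); // "https://example.com/page" - fragment stripped
```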
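
The file-type filter is the part most worth adjusting per site. Below is a standalone sketch of the same logic (the function name looksLikeWebPage and the sample paths are invented for illustration), runnable in any browser console without rama.js:

```
function looksLikeWebPage(pathname) {
    let parts = pathname.split(".");
    // plain path like /examplefolder/examplesite
    if (parts.length == 1) return true;
    let extension = parts[parts.length - 1];
    // a dot inside a folder name, e.g. /example.folder/examplesite
    if (extension.includes("/")) return true;
    // known page extensions
    return ["html", "htm", "php", "asp", "aspx", "shtml"].includes(extension);
}

["/docs/page", "/docs/page.html", "/downloads/file.pdf", "/v1.2/changelog"]
    .forEach((p) => console.log(p, looksLikeWebPage(p)));
// -> true, true, false, true
```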
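
A rough sketch of how the script is meant to be used: load rama.js on a page of the target domain, paste examples/crawler.js into the console, wait for "finished crawling", then inspect the two result arrays. The rama.js calls are only the ones already used in the diff (rama.clearpage(), rama.new(), c.waitForReload()); the inspection lines below are an assumption on my part, not part of the commit:

```
console.table(internalCrawled); // every page reached on the starting domain
console.table(externalLinks);   // every referenced page on other domains

// copy() is the devtools command-line helper (Chrome/Firefox consoles only)
copy(JSON.stringify({ internalCrawled, externalLinks }, null, 2));
```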