ramajs

web browser automation from your browser console
git clone https://tongong.net/git/ramajs.git

commit 12164bd41ac73b639e27b0f550fa61d9cac46e94
parent e5bf2bc9267c1edf5f2183406bd16a583e3ba3ba
Author: tongong <tongong@gmx.net>
Date:   Tue,  1 Dec 2020 17:23:04 +0100

added an example script

Diffstat:
M README.md           |  4 ++++
A examples/crawler.js | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 101 insertions(+), 0 deletions(-)

diff --git a/README.md b/README.md
@@ -185,6 +185,10 @@ NOTICE: inline styles given to specific elements stay!
 rama.clearcss();
 ```
 
+## examples
+
+See examples folder
+
 ## other projects
 
 I think, at the moment this is the only project with this main idea. But one can
diff --git a/examples/crawler.js b/examples/crawler.js
@@ -0,0 +1,97 @@
+// this script crawls through all connected pages on the same domain as it
+// started and generates one array of all the pages and one of all referenced
+// pages on other domains
+
+// run in the console after loading rama.js
+
+// links pointing to a different domain
+externalLinks = [];
+// links on the same domain, which were already crawled
+internalCrawled = [];
+// links on the same domain, which have to be crawled
+internalNotCrawled = [];
+
+// returns the url of a location object without the URL fragment
+// (https://en.wikipedia.org/wiki/URI_fragment)
+// -> no page has to be loaded twice
+function urlf(location) {
+    return location.href.replace(location.hash, "");
+}
+
+// Anonymous function wrapper needed for await to work
+(async () => {
+    // initialize the rama.js client
+    rama.clearpage();
+    c = rama.new();
+
+    await c.waitForReload();
+    internalNotCrawled.push(c.w.location.href);
+
+    while (internalNotCrawled.length > 0) {
+        // visit the next page
+        let newUrl = internalNotCrawled.shift();
+        internalCrawled.push(newUrl);
+        c.w.location = newUrl;
+        console.log("visiting " + newUrl);
+        console.log(
+            "internal: " +
+                internalCrawled.length +
+                " crawled and " +
+                internalNotCrawled.length +
+                " remaining. external: " +
+                externalLinks.length
+        );
+        await c.waitForReload();
+
+        // get all <a> tags
+        links = c.d.qsa("a");
+
+        // filter out all urls of filetypes that are not websites
+        links = links.filter((l) => {
+            let parts = l.pathname.split(".");
+
+            // just a normal path like /examplefolder/examplesite
+            if (parts.length == 1) return true;
+            else {
+                // file with extension like /examplefolder/examplefile.txt or a
+                // weird path like /example.folder/examplesite or
+                // /example/.folder/file.txt
+                let extension = parts[parts.length - 1];
+
+                // it is actually no extension but some weird path like
+                // /example.folder/examplesite
+                if (extension.includes("/")) return true;
+
+                // just a website with extension
+                if (
+                    ["html", "htm", "php", "asp", "aspx", "shtml"].includes(
+                        extension
+                    )
+                )
+                    return true;
+
+                // it has to be an evil file
+                return false;
+            }
+        });
+
+        // add the links to the lists if they are not already there
+        links.forEach((l) => {
+            if (l.hostname != c.w.location.hostname) {
+                // external link
+                // push to list if not already there
+                if (!externalLinks.includes(urlf(l)))
+                    externalLinks.push(urlf(l));
+            } else {
+                if (
+                    !internalCrawled.includes(urlf(l)) &&
+                    !internalNotCrawled.includes(urlf(l))
+                ) {
+                    // internal link, which is not already part of the lists
+                    internalNotCrawled.push(urlf(l));
+                }
+            }
+        });
+    }
+    console.log("finished crawling");
+})();
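
For context on urlf(): anchor elements expose the same href/hash properties as window.location, so two links that differ only in their fragment collapse to a single crawl entry. A quick illustration, assuming the script above has been pasted so urlf() is defined (the URL is made up):

```
let a = document.createElement("a");
a.href = "https://example.com/page#section-2";
console.log(urlf(a)); // "https://example.com/page" - fragment stripped
```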
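
The file-type filter is the part most worth adjusting per site. Below is a standalone sketch of the same logic (the function name looksLikeWebPage and the sample paths are invented for illustration), runnable in any browser console without rama.js:

```
function looksLikeWebPage(pathname) {
    let parts = pathname.split(".");
    // plain path like /examplefolder/examplesite
    if (parts.length == 1) return true;
    let extension = parts[parts.length - 1];
    // a dot inside a folder name, e.g. /example.folder/examplesite
    if (extension.includes("/")) return true;
    // known page extensions
    return ["html", "htm", "php", "asp", "aspx", "shtml"].includes(extension);
}

["/docs/page", "/docs/page.html", "/downloads/file.pdf", "/v1.2/changelog"]
    .forEach((p) => console.log(p, looksLikeWebPage(p)));
// -> true, true, false, true
```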
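
A rough sketch of how the script is meant to be used: load rama.js on a page of the target domain, paste examples/crawler.js into the console, wait for "finished crawling", then inspect the two result arrays. The rama.js calls are only the ones already used in the diff (rama.clearpage(), rama.new(), c.waitForReload()); the inspection lines below are an assumption on my part, not part of the commit:

```
console.table(internalCrawled); // every page reached on the starting domain
console.table(externalLinks);   // every referenced page on other domains

// copy() is the devtools command-line helper (Chrome/Firefox consoles only)
copy(JSON.stringify({ internalCrawled, externalLinks }, null, 2));
```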