ultraarchive/make-archives.js

280 lines
11 KiB
JavaScript

import axios from "axios";
import * as cheerio from "cheerio";
import twocaptcha from "2captcha";
import fs from "fs";
// Configuration parsed once, synchronously, at module load from `config.json` in the
// current working directory. Its "2captcha" entry is handed to the 2captcha solver.
const config = JSON.parse(fs.readFileSync(`${process.cwd()}/config.json`));
/**
 * Archives `url` with each enabled service, one after the other, and reports
 * progress through `onUpdate`.
 *
 * A failing archiver never rejects this function: the failure is reported via
 * `onUpdate` and recorded in the returned object as `{ status: "error", message }`.
 *
 * @param {string} url - Address of the page to archive.
 * @param {object} [options]
 * @param {boolean} [options.wayback] - Run the Wayback Machine archiver when loosely equal to `true`.
 * @param {boolean} [options.archivedotis] - Run the archive.is archiver when loosely equal to `true`.
 * @param {boolean} [options.log] - Console logging is on unless this is loosely equal to `false`.
 * @param {function(object): void} [onUpdate] - Receives `{ currentArchiver, status, message?, url? }` updates.
 * @returns {Promise<{wayback: object, archivedotis: object}>} Per-archiver outcome: the archiver's
 *   own result, `{ status: "disabled" }`, or `{ status: "error", message }`.
 */
async function create(url, {wayback, archivedotis, log} = {}, onUpdate) {
  const notify = typeof onUpdate === "function" ? onUpdate : () => {};
  // Loose comparisons are kept on purpose so existing callers passing 0/1 behave as before.
  const verbose = log != false;

  if (verbose) {
    console.log(`request to create archive recieved`);
    console.log(`- url:`, url);
    console.log(`- archive.org:`, wayback);
    console.log(`- archive.is:`, archivedotis);
  }

  // Runs one archiver and converts any failure into an error result instead of
  // leaving the corresponding entry of the return value undefined.
  const attempt = async (currentArchiver, startLog, startUpdate, run) => {
    try {
      if (verbose) console.log(startLog);
      notify(startUpdate);
      const outcome = await run();
      notify({currentArchiver, status: outcome.status, url: outcome.url});
      return outcome;
    } catch (err) {
      // Archivers may reject with plain strings, so fall back to String(err).
      const message = err?.message || err?.stack || err?.code || String(err);
      notify({currentArchiver, status: "error", message});
      return {status: "error", message};
    }
  };

  const wba = wayback == true
    ? await attempt(
      "wayback",
      `[archiver] sending request to backup to the wayback machine`,
      {currentArchiver: "wayback", status: "pending", message: "starting process"},
      () => archiveorg(url, {log, captureAll: true}, notify),
    )
    : {status: "disabled"};

  const adi = archivedotis == true
    ? await attempt(
      "archivedotis",
      `[archiver] sending request to backup to archive.is`,
      {currentArchiver: "archivedotis", status: "starting"},
      () => archiveis(url, {log}, notify),
    )
    : {status: "disabled"};

  return {wayback: wba, archivedotis: adi};
}
/**
 * Submits `captureUrl` to the Wayback Machine "Save Page Now" form, then polls
 * the job status endpoint until the job succeeds or fails.
 *
 * @param {string} captureUrl - Page to archive.
 * @param {object} [options]
 * @param {boolean} [options.log] - Console logging is on unless loosely equal to `false`.
 * @param {boolean} [options.captureAll] - Sends `capture_all=on` unless loosely equal to `false`.
 * @param {number} [options.pollIntervalMs=6000] - Delay before each status poll, in milliseconds.
 * @param {function(object): void} [onUpdate] - Called with a "pending" update while the job is running.
 * @returns {Promise<{status: string, url: string, duration: number}>} Snapshot URL and the
 *   reported job duration converted from seconds to milliseconds.
 * @throws {Error} When no job id can be found, the job reports an error, the status
 *   response is unrecognised, or any HTTP request fails.
 */
async function archiveorg(captureUrl, {log, captureAll, pollIntervalMs = 6000} = {}, onUpdate) {
  const verbose = log != false;
  const notify = typeof onUpdate === "function" ? onUpdate : () => {};

  // The Save Page Now field is `capture_all`; the former `captcha_all` spelling was
  // an unknown field, so the option never took effect.
  const data = `url=${encodeURIComponent(captureUrl)}&capture_all=${captureAll != false ? "on" : "off"}`;

  if (verbose) console.log(`[wayback] sending initial request to begin archiving...`);
  const init = await axios({
    method: `POST`,
    url: `https://web.archive.org/save/${encodeURIComponent(captureUrl)}`,
    data,
    headers: {
      "Content-Length": Buffer.byteLength(data),
      "Content-Type": "application/x-www-form-urlencoded",
      "Referer": "https://web.archive.org/save"
    }
  });

  // The job id is embedded in the returned page as the first argument of spn.watchJob("...").
  const page = typeof init.data === "string" ? init.data : "";
  const spinJob = page.split(`spn.watchJob("`)[1]?.split(`",`)[0];
  if (!spinJob) {
    // Without a job id there is nothing to poll; surface the form's error text if present.
    const errorBox = cheerio.load(page)(".error.save-page-form");
    const reason = errorBox.length > 0 ? errorBox.text().trim() : "";
    throw new Error(reason || "could not find a job id in the Save Page Now response.");
  }
  if (verbose) console.log(`[wayback] got response from request, spin job id:`, spinJob);

  // Sequential polling: one request at a time, and a failed request rejects the
  // returned promise instead of becoming an unhandled rejection inside a timer.
  for (;;) {
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
    const {data: job} = await axios({
      method: `GET`,
      // `t` only has to differ between polls (presumably a cache-buster — confirm).
      url: `https://web.archive.org/save/status/${spinJob}?t=${Date.now()}`
    });
    const state = job?.status;
    if (state == "success") {
      return {
        status: "success",
        url: `https://web.archive.org/web/${job.timestamp}/${job.original_url}`,
        duration: job.duration_sec * 1000
      };
    }
    if (state == "pending") {
      notify({currentArchiver: "wayback", status: "pending", message: "archiving"});
      continue;
    }
    if (state == "error") throw new Error(job.message);
    throw new Error("cannot parse job response.");
  }
}
/**
 * Submits `captureUrl` to archive.ph, solving the reCAPTCHA challenge through
 * 2captcha when the submission response contains a sitekey, then polls the
 * work-in-progress page until the archive is finished.
 *
 * @param {string} captureUrl - Page to archive.
 * @param {object} [options]
 * @param {boolean} [options.log] - Console logging is on unless loosely equal to `false`.
 * @param {boolean} [options.autoAgain] - Accepted for compatibility; this function does not read it.
 * @param {number} [options.pollIntervalMs=5000] - Delay before each poll of the work-in-progress page.
 * @param {function(object): void} [onUpdate] - Receives "pending" progress updates.
 * @returns {Promise<{status: string, url: string, duration: (number|null), new: boolean}>}
 *   `new: false` with `duration: null` when the response carries no refresh header and the
 *   canonical link is returned; `new: true` with the elapsed milliseconds otherwise.
 * @throws {Error} When a request fails, the captcha check yields no cookie, or the submission
 *   response has neither a refresh header nor a canonical link.
 */
async function archiveis(captureUrl, {log, autoAgain, pollIntervalMs = 5000} = {}, onUpdate) {
  const verbose = log != false;
  const notify = typeof onUpdate === "function" ? onUpdate : () => {};
  const report = (message) => notify({currentArchiver: "archivedotis", status: "pending", message});

  // Browser-like headers sent with every request; `extra` adds or overrides entries.
  const browserHeaders = (extra = {}) => ({
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "DNT": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "cross-site",
    "Sec-GPC": "1",
    "TE": "trailers",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:110.0) Gecko/20100101 Firefox/110.0",
    ...extra
  });
  // Non-2xx responses are inspected rather than thrown for the submission and poll requests.
  const acceptAnyStatus = () => true;

  if (verbose) console.log(`[archiveis] fetching initial request to get request id`);
  report("fetching request id");

  // The front page carries the `submitid` form value needed for a submission.
  const home = await axios({url: `https://archive.ph/`, headers: browserHeaders()});
  const subId = cheerio.load(home.data)(`[name="submitid"]`).val();
  if (verbose) console.log(`[archiveis] got submit id:`, subId);
  // Both query values are encoded so characters such as "+", "&" or "=" survive the trip.
  const subUrl = `https://archive.ph/submit/?anyway=1&submitid=${encodeURIComponent(subId)}&url=${encodeURIComponent(captureUrl)}`;
  if (verbose) console.log(`[archiveis] got submit url:`, subUrl);

  report("initializing submission");
  let msStart = Date.now();
  let res = await axios({
    url: subUrl,
    headers: browserHeaders({"Referer": "https://archive.ph/"}),
    validateStatus: acceptAnyStatus
  });

  // A reCAPTCHA sitekey in the page is taken to mean the response is a captcha
  // challenge rather than the submission result (NOTE(review): confirm against live pages).
  let cookie;
  const firstPage = typeof res.data === "string" ? res.data : "";
  const siteKey = firstPage.split(`'sitekey': '`)[1]?.split(`'`)[0];
  if (siteKey) {
    report("sending request for captcha");
    if (verbose) console.log(`[archiveis] sending captcha request`);
    const solver = new twocaptcha.Solver(config["2captcha"]);
    report("solving captcha");
    if (verbose) console.log(`[archiveis] getting captcha solution`);
    const {data: captcha} = await solver.recaptcha(siteKey, subUrl);

    // `location` is itself a URL with a query string, so it must be encoded to stay one form field.
    const captData = `response=${encodeURIComponent(captcha)}&location=${encodeURIComponent(subUrl)}`;
    const {headers: captchaHeaders} = await axios({
      method: `POST`,
      url: `https://archive.ph/cdn-cgi/l/chk_captcha`,
      data: captData,
      headers: browserHeaders({
        "Content-Length": Buffer.byteLength(captData),
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "https://archive.ph/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin"
      })
    });

    // Echo the first cookie's `name=value` pair verbatim (attributes after ";" dropped);
    // splitting on "=" would truncate values that contain "=".
    cookie = captchaHeaders?.["set-cookie"]?.[0]?.split(`;`)[0];
    if (!cookie) throw new Error("captcha check did not return a session cookie.");

    report("resending submission");
    msStart = Date.now();
    if (verbose) console.log(`[archiveis] resending archive request`);
    res = await axios({
      url: subUrl,
      headers: browserHeaders({"Cookie": cookie, "Referer": "https://archive.ph/"}),
      validateStatus: acceptAnyStatus
    });
  }

  const refresh = res.headers?.["refresh"];
  if (!refresh) {
    // No refresh header: use the page's canonical link and stop here. Returning
    // (instead of falling through) keeps the poller from starting without a URL.
    const url = cheerio.load(String(res.data ?? ""))(`link[rel="canonical"]`).attr("href");
    if (!url) {
      throw new Error(`archive.is returned neither a refresh header nor a canonical link (HTTP ${res.status}).`);
    }
    return {status: "success", url, duration: null, new: false};
  }

  // Everything after the first "url=" in the refresh header is the work-in-progress address.
  const refreshTarget = refresh.split(`url=`).slice(1).join(`url=`);
  if (!refreshTarget) throw new Error("archive.is refresh header did not contain a url.");
  const wip = new URL(refreshTarget, subUrl);
  const wipUrl = wip.href;
  // The third path segment of the work-in-progress URL is used as the archive id.
  const id = wip.pathname.split(`/`)[2];

  let updateNum = 1;
  for (;;) {
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
    let headers;
    try {
      ({headers} = await axios({
        method: "GET",
        url: wipUrl,
        validateStatus: acceptAnyStatus,
        headers: browserHeaders({...(cookie ? {"Cookie": cookie} : {}), "Referer": wipUrl})
      }));
    } catch (err) {
      // Best effort: a failed poll is logged and retried on the next round.
      console.log(err);
      continue;
    }
    // A `link` response header is treated as the signal that archiving has finished.
    if (headers?.link) {
      report("parsing result page");
      if (verbose) console.log(`[archiveis] got id:`, id);
      return {status: "success", url: `https://archive.ph/${id}`, duration: Date.now() - msStart, new: true};
    }
    if (verbose) console.log(`[archiveis] no redirect found, refreshing in ${pollIntervalMs / 1000} seconds`);
    updateNum += 1;
    report(`archiving (request ${updateNum})`);
  }
}
// Default export of this module: an object whose only member is `create`.
export default {
create
};