yt-forager/lib.js

269 lines
7.3 KiB
JavaScript

const axios = require("axios");
const cheerio = require("cheerio");
const ytdl = require("ytdl-core");
// Registry of page parsers. Each entry names a parser module (path relative
// to this file) and the snapshot dates it is able to handle. A date entry may
// carry only a `year`, in which case it covers every month of that year.
let parsers = [
  {
    src: "parsers/2015.js",
    dates: [2015, 2016, 2017, 2018, 2019].map((year) => ({ year })),
  },
];
module.exports = {
getCompatibleDates: getCompatibleDates, // get all parsible dates
handleArchiveUrl: handleArchiveUrl, // parser of web.archive.org urls
toNum: toNum, // ex: get "3K" to become 3000
fetchArchive: fetchArchive, // use wayback api to get snapshot of a URL
altUrl: altUrl, // fetch an alternative URL to the one given
isAlive: isAlive, // check if a youtube video is still alive
isArchived: isArchived, // check if a youtube video is archived
findThumbnail: findThumbnail // fetch the highest quality thumbnail
}
/**
 * Collect the dates supported by every registered parser.
 *
 * @returns {Array<object>} A new flat array holding each parser's date
 *   entries, in registration order. The entries themselves are the same
 *   objects stored in `parsers` (they are not copied).
 */
function getCompatibleDates() {
  // flatMap visits array elements only; the previous for...in loops would
  // also pick up any enumerable property added to Array.prototype.
  // A parser without a `dates` list contributes nothing.
  return parsers.flatMap((parser) => parser.dates ?? []);
}
/**
 * Break a web.archive.org snapshot URL into its parts.
 *
 * @param {?string} url - Snapshot URL of the form
 *   `.../web/<timestamp>[im_|if_]/<original url>`.
 * @returns {?object} `null` when `url` is null/undefined, otherwise an object
 *   with:
 *   - `type`: "direct" when the timestamp carries an `im_`/`if_` modifier,
 *     "friendly" otherwise;
 *   - `direct` / `friendly`: the snapshot URL with and without the modifier;
 *   - `date`: `{ year, month, day, hour, minute, second }` parsed from the
 *     timestamp (fields missing from a short timestamp are NaN);
 *   - `url`: the original (archived) URL;
 *   - `urlType`: a classification of `url` (channel pages, video, playlist,
 *     or "unknown");
 *   - `parser`: whatever `getParser` returns for `date`.
 * @throws {TypeError} If `url` cannot be parsed by `new URL`.
 * @throws {Error} "Invalid URL." if the path has no `/web/<timestamp>` part.
 */
function handleArchiveUrl(url) {
  if (url == null) return null;
  const parsedUrl = new URL(url);

  // Split on the FIRST "/web/" only: the archived URL may itself contain
  // "/web/", and splitting on every occurrence would cut it short.
  const marker = "/web/";
  const markerAt = parsedUrl.pathname.indexOf(marker);
  if (markerAt === -1) throw new Error("Invalid URL.");
  const afterMarker = parsedUrl.pathname.slice(markerAt + marker.length);
  const slashAt = afterMarker.indexOf("/");
  let date = slashAt === -1 ? afterMarker : afterMarker.slice(0, slashAt);
  if (!date) throw new Error("Invalid URL.");

  // Everything after the timestamp segment, query string and fragment included.
  const original =
    slashAt === -1
      ? ""
      : `${afterMarker.slice(slashAt + 1)}${parsedUrl.search}${parsedUrl.hash}`;

  const obj = {}; // data object to return
  if (date.endsWith("im_") || date.endsWith("if_")) {
    obj.type = "direct";
    date = date.slice(0, -3); // drop the 3-character modifier
  } else {
    obj.type = "friendly";
  }

  if (obj.type === "direct") {
    obj.direct = parsedUrl.href;
    obj.friendly = `https://web.archive.org/web/${date}/${original}`;
  } else {
    obj.friendly = parsedUrl.href;
    obj.direct = `https://web.archive.org/web/${date}if_/${original}`;
  }

  // Timestamp layout: YYYYMMDDhhmmss.
  const field = (from, to) => Number.parseInt(date.substring(from, to), 10);
  obj.date = {
    year: field(0, 4),
    month: field(4, 6),
    day: field(6, 8),
    hour: field(8, 10),
    minute: field(10, 12),
    second: field(12, 14),
  };

  obj.url = original;

  const channelMarkers = ["/user/", "/channel/", "/c/", "/profile?"];
  const channelTabs = [
    ["/videos", "channel_videos"],
    ["/playlists", "channel_playlists"],
    ["/channels", "channel_featured_channels"],
    ["/discussion", "channel_discussion"],
    ["/about", "channel_about"],
  ];
  if (channelMarkers.some((part) => original.includes(part))) {
    const tab = channelTabs.find(([suffix]) => original.endsWith(suffix));
    obj.urlType = tab ? tab[1] : "channel";
  } else if (original.includes("watch")) {
    obj.urlType = "video";
  } else if (original.includes("playlist")) {
    obj.urlType = "playlist";
  } else {
    obj.urlType = "unknown";
  }

  obj.parser = getParser(obj.date);
  return obj;
}
function getParser(date) {
for (let i in parsers) {
for (let a in parsers[i].dates) {
let pDate = parsers[i].dates[a];
if (pDate.year == date.year) {
if (
!pDate.month ||
pDate.month == date.month
) return require(`${__dirname}/${parsers[i].src}`)
}
}
}
return null;
}
/**
 * Convert an abbreviated count such as "3K" or "1,234 views" to a number.
 *
 * @param {*} string - Text to convert. Non-string values are returned as-is.
 * @returns {*} The parsed number, multiplied by 1e3 / 1e6 / 1e9 when the text
 *   left after the leading number is exactly "k" / "m" / "b". NaN when the
 *   text has no leading number.
 */
function toNum(string) {
  if (typeof string !== "string") return string;

  const cleaned = string
    .toLowerCase()
    .replace(/,/g, "") // thousands separators
    // Strip ALL whitespace, not just " ", "\n" and "\t": some locales group
    // digits with a non-breaking space, which parseFloat would stop at.
    .replace(/\s+/g, "")
    .split("views").join("") // English label
    .split("переглядів").join(""); // Ukrainian label

  // Whatever follows the leading number, e.g. "k" for "3.5k".
  const suffix = cleaned.replace(/^[+-]?\d*(\.\d+)?/, "");
  const num = parseFloat(cleaned.replace(/[kmb]/g, ""));

  const multipliers = new Map([
    ["k", 1000],
    ["m", 1000000],
    ["b", 1000000000],
  ]);
  const factor = multipliers.get(suffix);
  return factor === undefined ? num : num * factor;
}
/**
 * Ask the Wayback Machine availability API for the snapshot of `url` closest
 * to an optional date.
 *
 * @param {string} url - URL to look up.
 * @param {{year: (number|undefined), month: (number|undefined),
 *   day: (number|undefined)}} [date] - Preferred snapshot date; each part
 *   that is present is appended to the `timestamp` query value.
 * @returns {Promise<?string>} The closest snapshot's URL, or `null` when
 *   there is none, when the snapshot was captured with a status outside
 *   2xx/3xx, or when the request fails (the error is logged).
 */
async function fetchArchive(url, date) {
  try {
    let dateString = "";
    if (date) {
      if (date.year) dateString = `${date.year}`;
      if (date.month) dateString += String(date.month).padStart(2, "0");
      if (date.day) dateString += String(date.day).padStart(2, "0");
    }

    const resp = await axios({
      url: `https://archive.org/wayback/available?url=${encodeURIComponent(url)}&timestamp=${dateString}`,
      validateStatus: () => true, // never throw on an HTTP error status
    });

    const closest = resp.data?.archived_snapshots?.closest;
    const status = String(closest?.status ?? "");
    // Prevents dead links: a snapshot recorded with a 4xx/5xx status is a copy
    // of an error page. The previous check lacked the negation on the 3xx test
    // and so let exactly those snapshots through.
    if (status !== "" && !status.startsWith("2") && !status.startsWith("3")) {
      return null;
    }
    return closest?.url || null;
  } catch (err) {
    console.log(err);
    return null;
  }
}
/**
 * Fetch a page and return the link found in its author markup.
 *
 * The page at `url` is requested first; when that answers 404, the snapshot
 * returned by `fetchArchive(url)` is requested instead.
 *
 * @param {string} url - Page to inspect.
 * @returns {Promise<(string|null|undefined)>} The `href` of the element
 *   matching `[itemprop="author"] > [itemprop="url"]`; `undefined` when the
 *   page has no such element; `null` when the page is gone and no snapshot is
 *   available, or when any step fails.
 */
async function altUrl(url) {
  try {
    const acceptAnyStatus = () => true;
    let resp = await axios({ url, validateStatus: acceptAnyStatus });
    if (resp.status === 404) {
      // fetchArchive is async: without `await` a Promise was handed to axios
      // as the URL, so this fallback could never succeed.
      const archivedUrl = await fetchArchive(url);
      if (archivedUrl == null) return null;
      resp = await axios({ url: archivedUrl, validateStatus: acceptAnyStatus });
    }
    const $ = cheerio.load(resp.data);
    return $(`[itemprop="author"] > [itemprop="url"]`).attr("href");
  } catch (err) {
    // Best effort: any network or parsing failure means "no alternative".
    return null;
  }
}
/**
 * Left-pad a value with zeros.
 *
 * @param {*} num - Value to pad; it is converted to a string first.
 * @param {number} size - Minimum length of the result.
 * @returns {string} `num` as text, prefixed with "0" until it is at least
 *   `size` characters long. Longer values are returned unchanged.
 */
function pad(num, size) {
  const text = num + "";
  return text.padStart(size, "0");
}
/**
 * Check whether ytdl can still retrieve basic info for a video.
 *
 * @param {string} url - Video URL, passed straight to `ytdl.getBasicInfo`.
 * @returns {Promise<boolean>} `true` when the lookup succeeds, `false` when
 *   it throws or rejects for any reason.
 */
async function isAlive(url) {
  let alive = true;
  try {
    await ytdl.getBasicInfo(url);
  } catch (err) {
    alive = false;
  }
  return alive;
}
/**
 * Look for archived copies of a YouTube video.
 *
 * Three places are checked, in this order:
 *  1. a Wayback snapshot of `http://wayback-fakeurl.archive.org/yt/<id>`;
 *  2. the archive.org item `youtube-<id>`;
 *  3. only when both came up empty and `title` is given: the first hit of an
 *     archive.org search for `title`, accepted when its title matches
 *     case-insensitively.
 *
 * @param {string} url - Watch URL containing `v=<id>`, or the bare video id.
 * @param {string} [title] - Video title used for the search fallback.
 * @returns {Promise<{success: boolean, results: string[]}>} `results` lists
 *   the URLs found; `success` is true when there is at least one.
 */
async function isArchived(url, title) {
  const results = [];
  // Fall back to the raw argument when no "v=" parameter can be extracted.
  const id = url?.split(`v=`)?.[1]?.split(`&`)?.[0] || url;

  const direct = await fetchArchive(`http://wayback-fakeurl.archive.org/yt/${id}`);
  if (direct !== null) results.push(handleArchiveUrl(direct).direct);

  const detailsUrl = `https://archive.org/details/youtube-${id}`;
  const details = await axios({
    url: detailsUrl,
    validateStatus: () => true, // a 404 just means "not there"
  });
  if (details.status === 200) results.push(detailsUrl);

  if (results.length === 0 && title) {
    const search = await axios({
      url: `https://archive.org/search.php?query=${encodeURIComponent(title)}`,
    });
    const $ = cheerio.load(search.data);
    // NOTE(review): these selectors presumably target the search result
    // tiles of archive.org's markup - confirm they still match.
    const item = $(".C234 > div > a")?.[0];
    const itemTitle = $(".C234 > div > a > .ttl")?.[0]?.children?.[0]?.data;
    // A hit may lack a title node; treat that as "no match" instead of
    // crashing on `undefined.toLowerCase()`.
    if (
      item &&
      typeof itemTitle === "string" &&
      title.toLowerCase() === itemTitle.toLowerCase()
    ) {
      results.push(`https://archive.org${item?.attribs?.href}`);
    }
  }

  return {
    success: results.length > 0,
    results: results,
  };
}
/**
 * Find an archived copy of a video's thumbnail.
 *
 * Qualities are tried from best to worst ("maxresdefault", then "hqdefault");
 * for each quality every known thumbnail host is tried in turn, and the first
 * snapshot reported by `fetchArchive` wins.
 *
 * @param {string} url - Watch URL containing `v=<id>`, or the bare video id.
 * @returns {Promise<?string>} URL of the archived thumbnail, or `null` when
 *   no combination has a snapshot.
 */
async function findThumbnail(url) {
  // Fall back to the raw argument when no "v=" parameter can be extracted.
  const id = url?.split(`v=`)?.[1]?.split(`&`)?.[0] || url;
  const qualities = ["maxresdefault", "hqdefault"];
  const servers = [
    "i.ytimg.com",
    "i2.ytimg.com",
    "i3.ytimg.com",
    "i4.ytimg.com",
    "img.ytimg.com",
  ];

  for (const quality of qualities) {
    for (const server of servers) {
      // fetchArchive takes the URL as a plain string; wrapping it in an
      // object made every lookup query "[object Object]".
      const archived = await fetchArchive(`https://${server}/vi/${id}/${quality}.jpg`);
      if (archived !== null) return archived;
    }
  }
  return null;
}