231 lines
10 KiB
JavaScript
231 lines
10 KiB
JavaScript
const axios = require("axios");
|
|
const cheerio = require("cheerio");
|
|
const lib = require("../lib");
|
|
|
|
// Parsers for archived YouTube pages that predate the Polymer redesign.
|
|
|
|
// Public surface of this module: only the `parse` entry point is exported.
module.exports = { parse };
|
|
|
|
/**
 * Downloads an archived page and dispatches it to the matching parser.
 *
 * The URL is first passed through `lib.handleArchiveUrl`; this function reads
 * the `direct` (URL to request) and `urlType` (page kind handed to the
 * parsers) properties from its result.
 *
 * @param {string} url - Archive URL to fetch and parse.
 * @returns {Promise<object|null|undefined>} Whatever `handleHtml` /
 *   `handleSPF` return for the response, or `undefined` when the response is
 *   neither HTML nor JSON (including when it has no Content-Type header).
 */
async function parse(url) {
    const parsed = lib.handleArchiveUrl(url);
    const resp = await axios({ url: parsed.direct });

    // The header may be absent; read it once and guard every use so a missing
    // Content-Type falls through to `undefined` instead of throwing.
    const contentType = resp.headers?.["content-type"];

    if (contentType?.includes("text/html")) {
        // handle regular html pages
        const $ = cheerio.load(resp.data);
        return handleHtml($, parsed.urlType);
    }

    if (contentType?.includes("application/json")) {
        return handleSPF(resp.data, parsed.urlType);
    }

    return undefined;
}
|
|
|
|
/**
 * Extracts structured data from a loaded pre-polymer HTML page.
 *
 * @param {Function} $ - Cheerio-style query function: `$(selector)` must
 *   return an indexable collection with `length`, `.attr(name)` and `.html()`.
 * @param {string} type - Page kind; `"channel"` and `"video"` are supported.
 * @returns {object|null} The parsed data object (always carrying `type`), or
 *   `null` when the page kind is unsupported or the expected pre-polymer
 *   markup is missing.
 */
function handleHtml($, type) {
    const data = { type };

    switch (type) {
        case "channel":
            return parseChannelHtml($, data);

        case "video":
            return parseVideoHtml($, data);

        default:
            console.log("cannot parse this page.");
            return null;
    }
}

/** Logs the notice used when a page lacks the expected pre-polymer markup. */
function logUnsupportedPolymerPage() {
    console.log(`This is likely a polymer page under archive.org's radar. The polymer era is currently not supported.`);
}

/**
 * Fills `data` with channel id, videos, subscriber count, name and related
 * channels. Returns `null` when `#browse-items-primary` is absent.
 */
function parseChannelHtml($, data) {
    data.videos = [];
    data.id = $("head > [itemprop='channelId']").attr("content");

    const trailer = parseChannelTrailer($);
    if (trailer) data.videos.push(trailer);

    if (!$("#browse-items-primary")[0]) {
        logUnsupportedPolymerPage();
        return null;
    }

    if (!$("#browse-items-primary .branded-page-box")[0]) {
        // example: https://web.archive.org/web/20160820015628im_/https://www.youtube.com/channel/UCE6acMV3m35znLcf0JGNn7Q
        data.videos.push(...parseFeedVideos($));
    } else {
        // example: https://web.archive.org/web/20170525111347im_/https://www.youtube.com/channel/UCE6acMV3m35znLcf0JGNn7Q
        data.videos.push(...parseGridVideos($, data.id));
    }

    // This used to hang off a stray `else` after the exhaustive feed/grid
    // branches above, so it never ran and `subcount` was never populated.
    const subscriberCount = $(".yt-subscription-button-subscriber-count-branded-horizontal")[0];
    if (subscriberCount) {
        // NOTE(review): lib.toNum presumably turns a digit string into a
        // number; it may receive `undefined` when the title attribute is
        // missing, as it already can in the video branch — confirm in ../lib.
        data.subcount = lib.toNum(subscriberCount.attribs?.title?.split(`,`)?.join(``));
    } else {
        data.subcount = 0;
    }

    const channelTitle = $(".primary-header-upper-section .qualified-channel-title-text > a")[0];
    if (channelTitle) data.name = channelTitle.attribs?.title;

    const relatedChannels = parseRelatedChannels($);
    if (relatedChannels) data.relatedChannels = relatedChannels;

    return data;
}

/**
 * Reads the channel trailer, if the page has one with a resolvable video id.
 * example: https://web.archive.org/web/20160604222209im_/https://www.youtube.com/c/EmperorLemon
 */
function parseChannelTrailer($) {
    if (!$(".video-player-view-component")[0]) return null;

    const link = $(".video-player-view-component .video-detail > h3 > a")[0];
    const id = link?.attribs?.href?.split(`v=`)?.[1]?.split(`&`)[0];
    if (!id) return null;

    const countNode = $(".video-player-view-component .video-detail > .view-count > .count")[0];
    // Strip layout whitespace and the "views" label, leaving only the digits.
    const views = countNode?.children?.[0]?.data
        ?.split(`\n`).join(``)
        .split(`\t`).join(``)
        .split(` `).join(``)
        .split(`views`).join(``);

    return {
        type: "video",
        id: id,
        title: link.children?.[0]?.data,
        views: lib.toNum(views)
    };
}

/**
 * Collects videos from a feed-style channel page.
 * NOTE(review): byline/meta/title collections are matched to videos by
 * position, as before; this assumes every video has exactly one of each.
 */
function parseFeedVideos($) {
    // Each selector is evaluated once instead of once per loop iteration.
    const items = $("#browse-items-primary .legacy-style .yt-lockup-video");
    const bylines = $("#browse-items-primary .legacy-style .yt-lockup-video > .yt-lockup-dismissable > .yt-lockup-content > .yt-lockup-byline > a");
    const metas = $("#browse-items-primary .legacy-style .yt-lockup-video .yt-lockup-meta .yt-lockup-meta-info");
    const titles = $("#browse-items-primary .legacy-style .yt-lockup-video > .yt-lockup-dismissable > .yt-lockup-content > .yt-lockup-title > a");

    const videos = [];
    // Index loop: `for...in` would also visit non-element keys of the collection.
    for (let i = 0; i < items.length; i++) {
        const id = items[i]?.attribs?.["data-context-item-id"];
        if (!id) continue;

        const rawViews = metas[i]?.children?.[2]?.children?.[0]?.data?.split(` `)?.[0]?.split(`,`)?.join(``);

        videos.push({
            type: "video",
            id: id,
            uploader: {
                id: bylines[i]?.attribs?.["data-ytid"],
                name: bylines[i]?.children?.[0]?.data
            },
            views: Number.parseInt(rawViews, 10),
            title: titles[i]?.attribs?.title
        });
    }
    return videos;
}

/**
 * Collects videos from a grid-style channel page, skipping entries whose
 * byline points at a channel other than `channelId`.
 * NOTE(review): collections are matched to videos by position, as before.
 */
function parseGridVideos($, channelId) {
    const items = $("#browse-items-primary .compact-shelf .yt-lockup-video.yt-lockup-grid");
    const metas = $("#browse-items-primary .compact-shelf .yt-lockup-video.yt-lockup-grid .yt-lockup-meta .yt-lockup-meta-info");
    const titles = $("#browse-items-primary .compact-shelf .yt-lockup-video.yt-lockup-grid > .yt-lockup-dismissable > .yt-lockup-content > .yt-lockup-title > a");
    const contents = $("#browse-items-primary .compact-shelf .yt-lockup-video.yt-lockup-grid > .yt-lockup-dismissable > .yt-lockup-content");

    const videos = [];
    for (let i = 0; i < items.length; i++) {
        const id = items[i]?.attribs?.["data-context-item-id"];
        if (!id) continue;

        // i should possibly add a thing to declare the source of the
        // video, like what shelf it came from

        const rawViews = metas[i]?.children?.[0]?.children?.[0]?.data?.split(` `)?.[0]?.split(`,`)?.join(``);
        const video = {
            type: "video",
            id: id,
            views: lib.toNum(rawViews),
            title: titles[i]?.attribs?.title
        };

        // A byline whose second child carries a different channel id marks
        // the video as uploaded by someone else.
        const uploadedElsewhere = (contents[i]?.children ?? []).some((node) =>
            node?.attribs?.class === "yt-lockup-byline" &&
            node.children?.[1]?.attribs?.["data-ytid"] !== channelId);
        if (uploadedElsewhere) continue;

        videos.push(video);
    }
    return videos;
}

/** Returns the related-channels list, or `null` when the page has none. */
function parseRelatedChannels($) {
    const items = $(".branded-page-related-channels-list .branded-page-related-channels-item");
    if (!items[0]) return null;

    const titles = $(".branded-page-related-channels-list .branded-page-related-channels-item .yt-lockup-title > a");

    const channels = [];
    for (let i = 0; i < items.length; i++) {
        const id = items[i]?.attribs?.["data-external-id"];
        if (!id) continue;

        channels.push({
            id: id,
            url: `https://www.youtube.com/channel/${id}`,
            name: titles[i]?.attribs?.title
        });
    }
    return channels;
}

/**
 * Fills `data` with the watch-page fields (title, uploader, description,
 * related videos, counters, publish date). Returns `null` when no video id
 * can be read from the canonical link.
 */
function parseVideoHtml($, data) {
    data.id = $("head > [rel='canonical']")?.attr("href")?.split("v=")?.[1]?.split("&")?.[0];
    if (!data.id) {
        logUnsupportedPolymerPage();
        return null;
    }

    data.title = $(".watch-title")[0]?.attribs?.title?.split(`\n`).join(``).split(`\t`).join(``);

    const uploaderId = $(`#watch7-content > [itemprop="channelId"]`).attr("content");
    data.uploader = {
        type: "channel",
        id: uploaderId,
        url: `https://www.youtube.com/channel/${uploaderId}`,
        name: $("#watch7-user-header > .yt-user-info > a")[0]?.children?.[0]?.data,
        subcount: lib.toNum($(".yt-subscription-button-subscriber-count-branded-horizontal.yt-subscriber-count")[0]?.attribs?.title?.split(`,`)?.join(``))
    };

    // `.html()` yields null when the description element is missing; leave the
    // field undefined in that case rather than throwing.
    data.description = $("#eow-description").html()?.split(`<br>`).join(`\n`);

    data.relatedVideos = parseRelatedVideos($);

    data.views = Number.parseInt($(".watch-view-count")[0]?.children?.[0]?.data?.split(`,`).join(``), 10);
    data.likes = Number.parseInt($(".like-button-renderer-like-button > span")[0]?.children?.[0]?.data?.split(`,`).join(``), 10);
    data.dislikes = Number.parseInt($(".like-button-renderer-dislike-button > span")[0]?.children?.[0]?.data?.split(`,`).join(``), 10);

    // Missing date parts parse to NaN, matching the behaviour of the other counters.
    const dateParts = $("[itemprop='datePublished']").attr("content")?.split(`-`) ?? [];
    data.publishDate = {
        year: Number.parseInt(dateParts[0], 10),
        month: Number.parseInt(dateParts[1], 10),
        day: Number.parseInt(dateParts[2], 10)
    };

    return data;
}

/**
 * Collects the sidebar's related videos.
 * NOTE(review): link/attribution/view-count collections are matched to list
 * items by position, as before.
 */
function parseRelatedVideos($) {
    const items = $(".video-list-item");
    const links = $(".video-list-item > .content-wrapper > a");
    const owners = $(".video-list-item > .content-wrapper > .stat.attribution > .g-hovercard");
    const viewCounts = $(".video-list-item > .content-wrapper .stat.view-count");

    const videos = [];
    for (let i = 0; i < items.length; i++) {
        // Links without a `v=` parameter are skipped instead of throwing.
        const id = links[i]?.attribs?.href?.split(`v=`)?.[1]?.split(`&`)?.[0];
        if (!id) continue;

        const ownerId = owners[i]?.attribs?.["data-ytid"];
        videos.push({
            type: "video",
            id: id,
            title: links[i]?.attribs?.title,
            uploader: {
                id: ownerId,
                url: `https://www.youtube.com/channel/${ownerId}`,
                name: owners[i]?.children?.[0]?.data
            },
            views: Number.parseInt(viewCounts[i]?.children?.[0]?.data?.split(`,`).join(``), 10)
        });
    }
    return videos;
}
|
|
|
|
/**
 * Extracts video data from an SPF (JSON) watch-page response.
 *
 * Every entry of `data` is inspected: an entry with a
 * `body["watch7-container"]` HTML string contributes publish date, likes and
 * dislikes; an entry with `data.swfcfg` contributes title, uploader and view
 * count. Numeric fields that cannot be found are `NaN`.
 *
 * example: https://web.archive.org/web/20150819140026if_/https://www.youtube.com/watch?v=230Q5CDHH4Q
 *
 * @param {*} data - Parsed SPF payload (array or object of parts).
 * @param {string} type - Page kind; only `"video"` is supported.
 * @returns {object|null} Object with `type: "video"` plus whatever fields
 *   were found, or `null` for any other page kind.
 */
function handleSPF(data, type) {
    // parse spf swf config
    const obj = { type: "video" };

    switch (type) {
        case "video": {
            // `?? {}` keeps a null/undefined payload a no-op, as `for...in` was.
            for (const part of Object.values(data ?? {})) {
                const container = part?.body?.["watch7-container"];

                // Only scrape when the container really is markup text; any
                // other truthy value used to crash on `.split`.
                if (typeof container === "string" && container !== "") {
                    const dateParts = container
                        .split(`itemprop="datePublished" content="`)[1]
                        ?.split(`"`)[0]
                        ?.split(`-`) ?? [];

                    obj.publishDate = {
                        year: Number.parseInt(dateParts[0], 10),
                        month: Number.parseInt(dateParts[1], 10),
                        day: Number.parseInt(dateParts[2], 10)
                    };
                    obj.likes = Number.parseInt(
                        container.split(`aria-label="like this video along with `)[1]?.split(` other people"`)[0]?.split(`,`).join(``),
                        10
                    );
                    obj.dislikes = Number.parseInt(
                        container.split(`aria-label="dislike this video along with `)[1]?.split(` other people"`)[0]?.split(`,`).join(``),
                        10
                    );
                }

                const swfcfg = part?.data?.swfcfg;
                if (swfcfg) {
                    const args = swfcfg.args;
                    obj.title = args?.title;
                    obj.uploader = {
                        name: args?.author,
                        id: args?.ucid,
                        url: `https://www.youtube.com/channel/${args?.ucid}`
                    };
                    obj.views = Number.parseInt(args?.view_count, 10);
                }
            }
            return obj;
        }

        default:
            console.log("cannot parse this page.");
            return null;
    }
}