// yt-forager/parsers/2015.js
//
// 231 lines
// 10 KiB
// JavaScript

const axios = require("axios");
const cheerio = require("cheerio");
const lib = require("../lib");
// Parser for pages from before the polymer redesign.
// `parse` is the only function this module exposes.
module.exports = { parse };
/**
 * Downloads an archived page and parses it according to its content type.
 *
 * The URL is first passed through `lib.handleArchiveUrl`; the request is made
 * against the returned `direct` URL and the returned `urlType` is forwarded
 * to the content-specific parser.
 *
 * @param {string} url - URL handed to `lib.handleArchiveUrl`.
 * @returns {Promise<object|null|undefined>} the result of `handleHtml` for
 *   `text/html` responses, the result of `handleSPF` for `application/json`
 *   responses, and `undefined` for any other (or missing) content type.
 */
async function parse(url) {
  const parsed = lib.handleArchiveUrl(url);
  const resp = await axios({
    url: parsed.direct,
  });

  // A response without a content-type header must not crash the parser, so
  // normalise the header to a string before inspecting it.
  const contentType = String(resp.headers?.["content-type"] ?? "");

  if (contentType.includes("text/html")) {
    // handle regular html pages
    return handleHtml(cheerio.load(resp.data), parsed.urlType);
  }
  if (contentType.includes("application/json")) {
    return handleSPF(resp.data, parsed.urlType);
  }
  return undefined;
}
/**
 * Extracts structured data from a pre-polymer HTML page loaded into cheerio.
 *
 * @param {Function} $ - cheerio root function for the page (as returned by
 *   `cheerio.load`).
 * @param {string} type - page kind; "channel" and "video" are supported.
 * @returns {object|null} the parsed page data, or `null` when the page kind is
 *   not supported or the expected pre-polymer markup is missing.
 */
function handleHtml($, type) {
  const data = { type };
  switch (type) {
    case "channel":
      return parseChannelHtml($, data);
    case "video":
      return parseVideoHtml($, data);
    default:
      console.log("cannot parse this page.");
      return null;
  }
}

/** Logs the notice used whenever a page lacks the pre-polymer markup. */
function logUnsupportedPolymer() {
  console.log(`This is likely a polymer page under archive.org's radar. The polymer era is currently not supported.`);
}

/** Returns the value following `v=` in a link (up to the next `&`), or `undefined`. */
function videoIdFromHref(href) {
  return href?.split("v=")[1]?.split("&")[0];
}

/** Removes thousands separators from a count string; passes `undefined` through. */
function withoutCommas(text) {
  return text?.split(",").join("");
}

/**
 * Fills `data` with the channel id, videos, subscriber count, name and related
 * channels of a channel page. Returns `null` when `#browse-items-primary` is
 * missing from the page.
 */
function parseChannelHtml($, data) {
  data.videos = [];
  data.id = $("head > [itemprop='channelId']").attr("content");

  if ($(".video-player-view-component")[0]) {
    // channel trailer
    // example: https://web.archive.org/web/20160604222209im_/https://www.youtube.com/c/EmperorLemon
    const trailerLink = $(".video-player-view-component .video-detail > h3 > a")[0];
    const trailerId = videoIdFromHref(trailerLink?.attribs?.href);
    if (trailerId) {
      const countText = $(".video-player-view-component .video-detail > .view-count > .count")[0]?.children?.[0]?.data;
      data.videos.push({
        type: "video",
        id: trailerId,
        title: trailerLink.children?.[0]?.data,
        // strip whitespace first, then the "views" label
        views: lib.toNum(countText?.replace(/[\n\t ]/g, "").replace(/views/g, "")),
      });
    }
  }

  if (!$("#browse-items-primary")[0]) {
    logUnsupportedPolymer();
    return null;
  }

  // The presence of a `.branded-page-box` distinguishes the grid layout from
  // the feed layout.
  const isGridPage = Boolean($("#browse-items-primary .branded-page-box")[0]);
  const listedVideos = isGridPage ? parseChannelGridVideos($, data.id) : parseChannelFeedVideos($);
  data.videos.push(...listedVideos);

  // The subscriber count applies to both layouts, so it is read independently
  // of the feed/grid decision above.
  const subscribeCount = $(".yt-subscription-button-subscriber-count-branded-horizontal")[0];
  data.subcount = subscribeCount ? lib.toNum(withoutCommas(subscribeCount.attribs?.title)) : 0;

  const nameLink = $(".primary-header-upper-section .qualified-channel-title-text > a")[0];
  if (nameLink) data.name = nameLink.attribs?.title;

  const relatedChannels = parseRelatedChannels($);
  if (relatedChannels) data.relatedChannels = relatedChannels;

  return data;
}

/**
 * Collects the videos of a feed-style channel page.
 * example: https://web.archive.org/web/20160820015628im_/https://www.youtube.com/channel/UCE6acMV3m35znLcf0JGNn7Q
 */
function parseChannelFeedVideos($) {
  // Each selection is queried once; entries are matched up by position.
  const items = $("#browse-items-primary .legacy-style .yt-lockup-video");
  const bylines = $("#browse-items-primary .legacy-style .yt-lockup-video > .yt-lockup-dismissable > .yt-lockup-content > .yt-lockup-byline > a");
  const metas = $("#browse-items-primary .legacy-style .yt-lockup-video .yt-lockup-meta .yt-lockup-meta-info");
  const titles = $("#browse-items-primary .legacy-style .yt-lockup-video > .yt-lockup-dismissable > .yt-lockup-content > .yt-lockup-title > a");

  const videos = [];
  for (let i = 0; i < items.length; i++) {
    const id = items[i]?.attribs?.["data-context-item-id"];
    if (!id) continue;
    // the view count is the first word of the third meta entry
    const viewText = metas[i]?.children?.[2]?.children?.[0]?.data?.split(" ")[0];
    videos.push({
      type: "video",
      id: id,
      uploader: {
        id: bylines[i]?.attribs?.["data-ytid"],
        name: bylines[i]?.children?.[0]?.data,
      },
      views: parseInt(withoutCommas(viewText), 10),
      title: titles[i]?.attribs?.title,
    });
  }
  return videos;
}

/**
 * Collects the videos of a grid-style channel page, skipping entries whose
 * byline points at a channel other than `channelId`.
 * example: https://web.archive.org/web/20170525111347im_/https://www.youtube.com/channel/UCE6acMV3m35znLcf0JGNn7Q
 */
function parseChannelGridVideos($, channelId) {
  const items = $("#browse-items-primary .compact-shelf .yt-lockup-video.yt-lockup-grid");
  const metas = $("#browse-items-primary .compact-shelf .yt-lockup-video.yt-lockup-grid .yt-lockup-meta .yt-lockup-meta-info");
  const titles = $("#browse-items-primary .compact-shelf .yt-lockup-video.yt-lockup-grid > .yt-lockup-dismissable > .yt-lockup-content > .yt-lockup-title > a");
  const contents = $("#browse-items-primary .compact-shelf .yt-lockup-video.yt-lockup-grid > .yt-lockup-dismissable > .yt-lockup-content");

  const videos = [];
  for (let i = 0; i < items.length; i++) {
    const id = items[i]?.attribs?.["data-context-item-id"];
    if (!id) continue;
    // i should possibly add a thing to declare the source of the
    // video, like what shelf it came from
    const viewText = metas[i]?.children?.[0]?.children?.[0]?.data?.split(" ")[0];
    const video = {
      type: "video",
      id: id,
      views: lib.toNum(withoutCommas(viewText)),
      title: titles[i]?.attribs?.title,
    };
    // A byline whose second child carries a different channel id marks a
    // video that is not by this channel.
    const isByOtherChannel = (contents[i]?.children ?? []).some(
      (node) => node?.attribs?.class === "yt-lockup-byline" && node.children?.[1]?.attribs?.["data-ytid"] !== channelId
    );
    if (isByOtherChannel) continue;
    videos.push(video);
  }
  return videos;
}

/** Returns the related-channels list, or `undefined` when the page has none. */
function parseRelatedChannels($) {
  const items = $(".branded-page-related-channels-list .branded-page-related-channels-item");
  if (!items[0]) return undefined;
  const titles = $(".branded-page-related-channels-list .branded-page-related-channels-item .yt-lockup-title > a");

  const channels = [];
  for (let i = 0; i < items.length; i++) {
    const id = items[i]?.attribs?.["data-external-id"];
    if (!id) continue;
    channels.push({
      id: id,
      url: `https://www.youtube.com/channel/${id}`,
      name: titles[i]?.attribs?.title,
    });
  }
  return channels;
}

/**
 * Fills `data` with the details of a watch page. Returns `null` when no video
 * id can be read from the canonical link.
 */
function parseVideoHtml($, data) {
  data.id = videoIdFromHref($("head > [rel='canonical']").attr("href"));
  if (!data.id) {
    logUnsupportedPolymer();
    return null;
  }

  data.title = $(".watch-title")[0]?.attribs?.title?.replace(/[\n\t]/g, "");

  const channelId = $(`#watch7-content > [itemprop="channelId"]`).attr("content");
  data.uploader = {
    type: "channel",
    id: channelId,
    url: `https://www.youtube.com/channel/${channelId}`,
    name: $("#watch7-user-header > .yt-user-info > a")[0]?.children?.[0]?.data,
    subcount: lib.toNum(withoutCommas($(".yt-subscription-button-subscriber-count-branded-horizontal.yt-subscriber-count")[0]?.attribs?.title)),
  };

  // `.html()` yields null when the description element is absent.
  data.description = $("#eow-description").html()?.split(`<br>`).join(`\n`);
  data.relatedVideos = parseRelatedVideos($);

  data.views = parseInt(withoutCommas($(".watch-view-count")[0]?.children?.[0]?.data), 10);
  data.likes = parseInt(withoutCommas($(".like-button-renderer-like-button > span")[0]?.children?.[0]?.data), 10);
  data.dislikes = parseInt(withoutCommas($(".like-button-renderer-dislike-button > span")[0]?.children?.[0]?.data), 10);

  // datePublished is split on "-" into year, month and day; missing parts become NaN.
  const dateParts = $("[itemprop='datePublished']").attr("content")?.split("-") ?? [];
  data.publishDate = {
    year: parseInt(dateParts[0], 10),
    month: parseInt(dateParts[1], 10),
    day: parseInt(dateParts[2], 10),
  };
  return data;
}

/** Collects the related-videos sidebar entries that link to a video. */
function parseRelatedVideos($) {
  const items = $(".video-list-item");
  const links = $(".video-list-item > .content-wrapper > a");
  const uploaders = $(".video-list-item > .content-wrapper > .stat.attribution > .g-hovercard");
  const viewCounts = $(".video-list-item > .content-wrapper .stat.view-count");

  const videos = [];
  for (let i = 0; i < items.length; i++) {
    // links without a `v=` parameter are skipped
    const id = videoIdFromHref(links[i]?.attribs?.href);
    if (!id) continue;
    const uploaderId = uploaders[i]?.attribs?.["data-ytid"];
    videos.push({
      type: "video",
      id: id,
      title: links[i]?.attribs?.title,
      uploader: {
        id: uploaderId,
        url: `https://www.youtube.com/channel/${uploaderId}`,
        name: uploaders[i]?.children?.[0]?.data,
      },
      views: parseInt(withoutCommas(viewCounts[i]?.children?.[0]?.data), 10),
    });
  }
  return videos;
}
/**
 * Extracts video details from a parsed SPF (JSON) response.
 *
 * Each response part is inspected for two things: the `watch7-container`
 * markup under `body` (publish date, likes, dislikes) and the `swfcfg` player
 * config under `data` (title, uploader, views). Later parts overwrite values
 * set by earlier ones.
 *
 * @param {Array|object} data - decoded response body; either an array of
 *   parts or a single part object.
 * @param {string} type - page kind; only "video" is supported.
 * @returns {object|null} the collected fields (always including
 *   `type: "video"`), or `null` for unsupported page kinds.
 */
function handleSPF(data, type) {
  // parse spf swf config
  const obj = { type: "video" };
  switch (type) {
    case "video":
      // example: https://web.archive.org/web/20150819140026if_/https://www.youtube.com/watch?v=230Q5CDHH4Q
      for (const part of spfResponseParts(data)) {
        const watchHtml = part?.body?.["watch7-container"];
        // only string markup can be scanned; anything else is ignored
        if (typeof watchHtml === "string" && watchHtml !== "") {
          const dateParts = textBetween(watchHtml, `itemprop="datePublished" content="`, `"`)?.split("-") ?? [];
          obj.publishDate = {
            year: parseInt(dateParts[0], 10),
            month: parseInt(dateParts[1], 10),
            day: parseInt(dateParts[2], 10),
          };
          obj.likes = parseInt(
            textBetween(watchHtml, `aria-label="like this video along with `, ` other people"`)?.split(",").join(""),
            10
          );
          obj.dislikes = parseInt(
            textBetween(watchHtml, `aria-label="dislike this video along with `, ` other people"`)?.split(",").join(""),
            10
          );
        }

        const swfcfg = part?.data?.swfcfg;
        if (swfcfg) {
          const args = swfcfg.args;
          obj.title = args?.title;
          obj.uploader = {
            name: args?.author,
            id: args?.ucid,
            url: `https://www.youtube.com/channel/${args?.ucid}`,
          };
          obj.views = parseInt(args?.view_count, 10);
        }
      }
      return obj;
    default:
      console.log("cannot parse this page.");
      return null;
  }
}

/**
 * Normalises a decoded SPF payload into a list of parts to inspect.
 * Arrays are used as-is. A plain object is treated as a single part; its
 * property values are also visited so that objects acting as containers of
 * parts keep being handled as before. Anything else yields no parts.
 */
function spfResponseParts(data) {
  if (Array.isArray(data)) return data;
  if (data !== null && typeof data === "object") return [data, ...Object.values(data)];
  return [];
}

/** Returns the text after the first `startMarker` up to the next `endMarker`, or `undefined`. */
function textBetween(haystack, startMarker, endMarker) {
  return haystack.split(startMarker)[1]?.split(endMarker)[0];
}