Add article scraping

This commit is contained in:
patrickkfkan 2021-02-07 19:44:57 +08:00
parent c03fd871c5
commit 07b2056bbf
6 changed files with 370 additions and 2 deletions

12
examples/getArticle.js Normal file
View File

@ -0,0 +1,12 @@
const bcfetch = require('../');
const util = require('util');
// Example: fetch a single Bandcamp Daily article and print the parsed result.
const articleUrl = 'https://daily.bandcamp.com/best-ambient/best-new-ambient-march-2018';
const options = { includeRawData: false };

bcfetch.getArticle(articleUrl, options).then((article) => {
    // depth: null makes util.inspect recurse into nested objects without a depth limit.
    const dump = util.inspect(article, { showHidden: false, depth: null, colors: false });
    console.log(dump);
});

View File

@ -0,0 +1,6 @@
const bcfetch = require('../');
const util = require('util');
// Example: fetch the Bandcamp Daily article categories and print the parsed result.
bcfetch.getArticleCategories().then((categories) => {
    // depth: null makes util.inspect recurse into nested objects without a depth limit.
    const dump = util.inspect(categories, { showHidden: false, depth: null, colors: false });
    console.log(dump);
});

View File

@ -0,0 +1,12 @@
const bcfetch = require('../');
const util = require('util');
// Example: fetch page 2 of the "best-ambient" category and print the parsed list.
const params = {
    categoryUrl: 'https://daily.bandcamp.com/best-ambient',
    page: 2
};

bcfetch.getArticleList(params).then((list) => {
    // depth: null makes util.inspect recurse into nested objects without a depth limit.
    const dump = util.inspect(list, { showHidden: false, depth: null, colors: false });
    console.log(dump);
    const fetchedCount = list.articles.length;
    console.log('Number of articles fetched: ' + fetchedCount);
});

View File

@ -259,6 +259,34 @@ async function getShow(showUrl, options = {}) {
.then( html => parser.parseShow(html, opts) ); .then( html => parser.parseShow(html, opts) );
} }
/**
 * Fetches the page at the default Daily URL (`utils.getDailyUrl()` with no
 * params) and hands its HTML to `parser.parseArticleCategories`.
 *
 * @returns {Promise<*>} Resolves with the value returned by
 *   `parser.parseArticleCategories`; rejects if the fetch or the parse fails.
 */
async function getArticleCategories() {
    const dailyUrl = utils.getDailyUrl();
    const html = await _fetchPage(dailyUrl);
    return parser.parseArticleCategories(html);
}
/**
 * Fetches one page of an article listing and parses it.
 *
 * @param {Object} [params]
 * @param {string} [params.categoryUrl] URL of the category to list. When it is
 *   null/undefined, the URL produced by `utils.getUrl('latest', <daily url>)`
 *   is used instead.
 * @param {number} [params.page] Page number, forwarded to `utils.getDailyUrl`.
 * @param {Object} [options]
 * @param {*} [options.imageFormat] Passed through `_parseImageFormatArg` and
 *   forwarded to the parser as `imageFormat`.
 * @returns {Promise<*>} Resolves with the value returned by
 *   `parser.parseArticleList`.
 */
async function getArticleList(params = {}, options = {}) {
    // Work on a copy: the caller's params object must not gain a categoryUrl
    // as a side effect of calling this function.
    const listParams = Object.assign({}, params);
    if (listParams.categoryUrl == null) {
        listParams.categoryUrl = utils.getUrl('latest', utils.getDailyUrl());
    }
    const opts = {
        imageFormat: await _parseImageFormatArg(options.imageFormat)
    };
    const html = await _fetchPage(utils.getDailyUrl(listParams));
    return parser.parseArticleList(html, opts);
}
/**
 * Fetches the page at `articleUrl` and parses it with `parser.parseArticle`.
 *
 * @param {string} articleUrl URL of the article page to fetch.
 * @param {Object} [options]
 * @param {*} [options.albumImageFormat] Resolved via `_parseImageFormatArg`
 *   with 9 as its second argument.
 * @param {*} [options.artistImageFormat] Resolved via `_parseImageFormatArg`
 *   with 21 as its second argument.
 * @param {boolean} [options.includeRawData] Coerced to a strict boolean and
 *   forwarded to the parser.
 * @returns {Promise<*>} Resolves with the value returned by
 *   `parser.parseArticle`.
 */
async function getArticle(articleUrl, options = {}) {
    const { baseUrl } = await _getImageConstants();
    const albumImageFormat = await _parseImageFormatArg(options.albumImageFormat, 9);
    const artistImageFormat = await _parseImageFormatArg(options.artistImageFormat, 21);

    const parseOpts = {
        imageBaseUrl: baseUrl,
        albumImageFormat,
        artistImageFormat,
        includeRawData: Boolean(options.includeRawData)
    };

    const html = await _fetchPage(articleUrl);
    return parser.parseArticle(html, parseOpts);
}
async function _fetchPage(url, json = false) { async function _fetchPage(url, json = false) {
return _cache.getOrSet('page', url + (json ? ':json' : ':html'), () => { return _cache.getOrSet('page', url + (json ? ':json' : ':html'), () => {
return fetch(url).then( res => json ? res.json() : res.text() ); return fetch(url).then( res => json ? res.json() : res.text() );
@ -291,4 +319,7 @@ module.exports = {
cache, cache,
getAllShows, getAllShows,
getShow, getShow,
getArticleCategories,
getArticleList,
getArticle
}; };

View File

@ -1,7 +1,7 @@
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const {decode} = require('html-entities'); const {decode} = require('html-entities');
const utils = require('./utils.js'); const utils = require('./utils.js');
const {URL} = require('url'); const {EOL} = require('os');
// https://github.com/masterT/bandcamp-scraper/blob/master/lib/htmlParser.js // https://github.com/masterT/bandcamp-scraper/blob/master/lib/htmlParser.js
function assignProps(objFrom, objTo, propNames) { function assignProps(objFrom, objTo, propNames) {
@ -702,6 +702,301 @@ function parseShow(html, opts) {
return null; return null;
} }
/**
 * Parses category sections out of a page's HTML.
 *
 * Walks every <section> that is a direct child of `#daily-view-all`. Each
 * section becomes `{ name, title, sections?, categories? }`:
 *  - `name` is the section's `class` attribute;
 *  - `title` is the text of an immediately preceding <h2>, or '' if none;
 *  - `sections` holds nested <section> children, parsed recursively;
 *  - `categories` holds `{ url, name }` for each <a> inside <div> children.
 * Empty `sections` / `categories` keys are omitted, and a section with
 * neither is dropped from the output.
 *
 * @param {string} html Page markup.
 * @returns {Array<Object>} Parsed top-level sections, in document order.
 */
function parseArticleCategories(html) {
    const $ = cheerio.load(html);
    const baseUrl = utils.getDailyUrl();

    // Turns every link inside `container` into a { url, name } entry,
    // resolving non-absolute hrefs against the Daily base URL.
    const collectCategories = (container) =>
        container.find('a').toArray().map((node) => {
            const link = $(node);
            const href = link.attr('href');
            const url = utils.isAbsoluteUrl(href) ? href : utils.getUrl(href, baseUrl);
            return { url, name: link.text() };
        });

    // Recursively parses one <section>; returns null when it yields nothing.
    const parseSection = (sectionEl) => {
        const headingEl = sectionEl.prev('h2');
        const node = {
            name: sectionEl.attr('class'),
            title: headingEl.length ? headingEl.text() : ''
        };
        const subsections = [];
        const categories = [];

        for (const child of sectionEl.children().toArray()) {
            if (child.tagName === 'section') {
                const nested = parseSection($(child));
                if (nested !== null) {
                    subsections.push(nested);
                }
            }
            else if (child.tagName === 'div') {
                categories.push(...collectCategories($(child)));
            }
        }

        if (subsections.length === 0 && categories.length === 0) {
            return null;
        }
        // Only attach the keys that actually carry data.
        if (subsections.length > 0) {
            node.sections = subsections;
        }
        if (categories.length > 0) {
            node.categories = categories;
        }
        return node;
    };

    return $('#daily-view-all')
        .children('section')
        .toArray()
        .map((el) => parseSection($(el)))
        .filter((parsed) => parsed !== null);
}
/**
 * Parses an article listing page.
 *
 * Every `.list-article` element found inside an `articles-list` element
 * becomes `{ url, title, date, imageUrl, category }`. Items that contain no
 * `a.title` link are skipped. The "<start> to <end> of <total>" text read
 * from `#num-results` fills `start`, `end` and `total`; when that text is
 * missing or does not match, the three counters stay at 0.
 *
 * @param {string} html Page markup.
 * @param {Object} opts
 * @param {*} opts.imageFormat Forwarded to `utils.reformatImageUrl`.
 * @returns {{articles: Array<Object>, total: number, start: number, end: number}}
 */
function parseArticleList(html, opts) {
    const $ = cheerio.load(html);
    const dailyUrl = utils.getDailyUrl();
    const results = {
        articles: [],
        total: 0,
        start: 0,
        end: 0
    };

    $('articles-list').each( (i, list) => {
        $('.list-article', $(list)).each( (j, el) => {
            const article = $(el);

            // A Cheerio selection is always truthy, so test its length to
            // find out whether a title link actually exists.
            const titleLink = article.find('a.title');
            if (titleLink.length === 0) {
                return; // returning undefined continues with the next item
            }

            const imageUrl = article.find('img').attr('src') || null;

            // category
            const infoText = article.find('.article-info-text');
            const infoTextCategoryLink = infoText.find('a.franchise');
            const infoTextMiddot = infoText.find('.middot');
            const category = {
                url: infoTextCategoryLink.attr('href') || null,
                name: infoTextCategoryLink.text() || ''
            };
            // Leave a missing category URL as null instead of resolving it.
            if (category.url !== null && !utils.isAbsoluteUrl(category.url)) {
                category.url = utils.getUrl(category.url, dailyUrl);
            }

            // date: what is left of the info text once the category link
            // and the separator have been removed from it.
            infoTextCategoryLink.remove();
            infoTextMiddot.remove();
            const date = utils.stripLineBreaks(infoText.text()).trim();

            // title and url
            const title = titleLink.text();
            let url = titleLink.attr('href');
            if (!utils.isAbsoluteUrl(url)) {
                url = utils.getUrl(url, dailyUrl);
            }

            results.articles.push({
                url,
                title,
                date,
                imageUrl: utils.reformatImageUrl(imageUrl, opts.imageFormat),
                category
            });
        });
    });

    const resultsText = utils.stripLineBreaks($('#num-results').text()).trim();
    // String.prototype.match returns null (not an empty array) on no match.
    const rtm = resultsText.match(/(\d+)(?:\s*to\s*)(\d+)(?:\s*of\s*)(\d+)/);
    if (rtm !== null) {
        results.total = parseInt(rtm[3], 10);
        results.start = parseInt(rtm[1], 10);
        results.end = parseInt(rtm[2], 10);
    }
    return results;
}
/**
 * Parses a single article page.
 *
 * Basic metadata is read from the page's `application/ld+json` script, the
 * featured albums/tracks from the `data-player-infos` attribute of
 * `#p-daily-article`, and the body text from the <p> elements around each
 * `bamplayer-art` element inside `#p-daily-article article`.
 *
 * @param {string} html Page markup.
 * @param {Object} opts
 * @param {string} opts.imageBaseUrl Prefix for generated image URLs.
 * @param {{id: *}} opts.albumImageFormat Its `id` is used in album image URLs.
 * @param {{id: *}} opts.artistImageFormat Its `id` is used in artist image URLs.
 * @param {boolean} [opts.includeRawData] When truthy, the result gets a `raw`
 *   property holding the unprocessed source data.
 * @returns {Object} The parsed article: title, description, url, imageUrl,
 *   date, category, genre, author, mediaItems, sections (and raw).
 */
function parseArticle(html, opts) {
    const $ = cheerio.load(html);
    const dailyUrl = utils.getDailyUrl();

    const basic = JSON.parse($('script[type="application/ld+json"]').html());
    const players = JSON.parse(decode($('#p-daily-article').attr('data-player-infos')));
    // Tolerate metadata that carries no author object.
    const author = basic.author || {};

    const article = {
        title: basic.headline,
        description: basic.description,
        url: basic['@id'],
        imageUrl: basic.image,
        date: basic.datePublished,
        category: {
            name: basic.articleSection,
            url: null
        },
        genre: null,
        author: {
            name: author.name,
            url: author['@id']
        },
        mediaItems: [],
        sections: []
    };

    // Resolves a non-absolute URL against the Daily base URL.
    const _toAbsoluteUrl = url => utils.isAbsoluteUrl(url) ? url : utils.getUrl(url, dailyUrl);

    // get genre
    const genreLink = $('.genre a');
    if (genreLink.length > 0) {
        article.genre = {
            name: genreLink.text(),
            url: genreLink.attr('href')
        };
        const genreReadMoreLink = $('.moreingenre a');
        if (genreReadMoreLink.length > 0) {
            article.genre.readMoreUrl = _toAbsoluteUrl(genreReadMoreLink.attr('href'));
        }
    }

    // get category url
    const categoryLink = $('article-type a');
    if (categoryLink.length > 0) {
        article.category.url = _toAbsoluteUrl(categoryLink.attr('href'));
    }

    // get media items (albums and tracks featured in article)
    if (Array.isArray(players)) {
        players.forEach( player => {
            const mediaItem = {
                type: 'unknown',
                name: player.title,
                url: player.tralbum_url,
                imageUrl: '',
                featuredTrackPosition: player.featured_track,
                artist: {
                    name: player.band_name,
                    url: player.band_url,
                    imageUrl: '',
                    location: player.band_location
                },
                tracks: [],
                mediaItemRef: player.player_id
            };
            if (player.parent_tralbum_type === 'a') {
                mediaItem.type = 'album';
            }
            else if (player.parent_tralbum_type === 't') {
                mediaItem.type = 'track';
            }
            if (player.art_id) {
                mediaItem.imageUrl = opts.imageBaseUrl + '/img/a' + player.art_id + '_' + opts.albumImageFormat.id + '.jpg';
            }
            if (player.band_image_id) {
                mediaItem.artist.imageUrl = opts.imageBaseUrl + '/img/' + player.band_image_id + '_' + opts.artistImageFormat.id + '.jpg';
            }
            if (Array.isArray(player.tracklist)) {
                player.tracklist.forEach( trackInfo => {
                    // audio_url may be absent; report null rather than throwing.
                    const audioUrl = trackInfo.audio_url;
                    mediaItem.tracks.push({
                        position: trackInfo.track_number,
                        name: trackInfo.track_title,
                        duration: trackInfo.audio_track_duration,
                        streamUrl: (audioUrl && audioUrl['mp3-128']) || null
                    });
                });
            }
            article.mediaItems.push(mediaItem);
        });
    }

    // Appends the html/text of each <p> element (given in document order) to
    // `section`: html parts are separated by one EOL, text parts by a blank line.
    const _appendParagraphs = (section, paragraphEls) => {
        paragraphEls.forEach( el => {
            const p = $(el);
            section.html += (section.html !== '' ? EOL : '') + p.html();
            section.text += (section.text !== '' ? EOL + EOL : '') + p.text();
        });
        return section;
    };

    // Function that returns a section corresponding to a media item
    const _getSectionByPlayer = player => {
        const section = {
            heading: null,
            html: '',
            text: '',
            mediaItemRef: null
        };
        // Get heading: prevUntil yields the closest sibling first, so first()
        // is the nearest h3/h2 preceding this player.
        const heading = player.prevUntil('bamplayer-art', 'h3, h2').first();
        if (heading.length > 0) {
            section.heading = {
                html: heading.html(),
                text: utils.stripTags(utils.brToNewLine(heading.html())).trim()
            };
        }
        // Get html and text
        const paragraphs = player.nextUntil('bamplayer-art, h3, h5, article-end', 'p');
        _appendParagraphs(section, paragraphs.toArray());
        // get mediaItemRef; a missing attribute or a failed match yields null
        const dataBind = player.attr('data-bind') || '';
        const playerIdMatch = dataBind.match(/playerMap\["(.+?)"]/);
        section.mediaItemRef = playerIdMatch ? playerIdMatch[1] : null;
        return section;
    };

    // Function that returns the introductory paragraph(s) of the article
    const _getIntroSection = articleBody => {
        const firstPlayer = articleBody.find('bamplayer-art').first();
        // prevAll returns siblings closest-first (reverse document order), so
        // reverse it to restore reading order.
        const paragraphs = firstPlayer.length > 0 ?
            firstPlayer.prevAll('p').toArray().reverse() :
            articleBody.find('p').toArray();
        if (paragraphs.length === 0) {
            return null;
        }
        return _appendParagraphs({ html: '', text: '' }, paragraphs);
    };

    // sections
    const articleBody = $('#p-daily-article article');
    const introSection = _getIntroSection(articleBody);
    if (introSection) {
        article.sections.push(introSection);
    }
    articleBody.find('bamplayer-art').each( (i, player) => {
        article.sections.push(_getSectionByPlayer($(player)));
    });

    if (opts.includeRawData) {
        article.raw = {
            basic,
            mediaItems: players,
            body: articleBody.html()
        };
    }
    return article;
}
module.exports = { module.exports = {
parseDiscoverResults, parseDiscoverResults,
parseDiscoverOptions, parseDiscoverOptions,
@ -716,4 +1011,7 @@ module.exports = {
parseTags, parseTags,
parseAllShows, parseAllShows,
parseShow, parseShow,
parseArticleCategories,
parseArticleList,
parseArticle
}; };

View File

@ -132,6 +132,14 @@ function getShowUrl(showId) {
return getSiteUrl() + '/?show=' + showId; return getSiteUrl() + '/?show=' + showId;
} }
/**
 * Builds a Daily URL.
 *
 * @param {Object} [params]
 * @param {string} [params.categoryUrl] URL to use instead of the default
 *   'https://daily.bandcamp.com'.
 * @param {number|string} [params.page] When truthy, added as a `page` query
 *   parameter.
 * @returns {string} The resulting URL.
 */
function getDailyUrl(params = {}) {
    let url = params.categoryUrl || 'https://daily.bandcamp.com';
    if (params.page) {
        // Extend an existing query string with '&' rather than starting a second one.
        const separator = url.includes('?') ? '&' : '?';
        url += separator + 'page=' + encodeURIComponent(params.page);
    }
    return url;
}
module.exports = { module.exports = {
getUrl, getUrl,
getSiteUrl, getSiteUrl,
@ -148,5 +156,6 @@ module.exports = {
isAbsoluteUrl, isAbsoluteUrl,
getAllShowsUrl, getAllShowsUrl,
getShowIdFromUrl, getShowIdFromUrl,
getShowUrl getShowUrl,
getDailyUrl
}; };