Add article scraping
This commit is contained in:
parent
c03fd871c5
commit
07b2056bbf
12
examples/getArticle.js
Normal file
12
examples/getArticle.js
Normal file
|
@ -0,0 +1,12 @@
|
|||
// Example: fetch a single Bandcamp Daily article and print the parsed result.
const bcfetch = require('../');
const util = require('util');

const articleUrl = 'https://daily.bandcamp.com/best-ambient/best-new-ambient-march-2018';

const options = {
    includeRawData: false
};

bcfetch.getArticle(articleUrl, options)
    .then( results => {
        // depth = null makes util.inspect print the full nested structure
        console.log(util.inspect(results, false, null, false));
    })
    .catch( err => {
        // Report fetch/parse failures instead of leaving the rejection unhandled
        console.error(err);
        process.exitCode = 1;
    });
|
6
examples/getArticleCategories.js
Normal file
6
examples/getArticleCategories.js
Normal file
|
@ -0,0 +1,6 @@
|
|||
// Example: fetch the Bandcamp Daily article categories and print them.
const bcfetch = require('../');
const util = require('util');

bcfetch.getArticleCategories()
    .then( results => {
        // depth = null makes util.inspect print the full nested structure
        console.log(util.inspect(results, false, null, false));
    })
    .catch( err => {
        // Report fetch/parse failures instead of leaving the rejection unhandled
        console.error(err);
        process.exitCode = 1;
    });
|
12
examples/getArticleList.js
Normal file
12
examples/getArticleList.js
Normal file
|
@ -0,0 +1,12 @@
|
|||
// Example: fetch one page of a Bandcamp Daily category listing and print it.
const bcfetch = require('../');
const util = require('util');

const params = {
    categoryUrl: 'https://daily.bandcamp.com/best-ambient',
    page: 2
};

bcfetch.getArticleList(params)
    .then( results => {
        // depth = null makes util.inspect print the full nested structure
        console.log(util.inspect(results, false, null, false));
        console.log('Number of articles fetched: ' + results.articles.length);
    })
    .catch( err => {
        // Report fetch/parse failures instead of leaving the rejection unhandled
        console.error(err);
        process.exitCode = 1;
    });
|
31
lib/index.js
31
lib/index.js
|
@ -259,6 +259,34 @@ async function getShow(showUrl, options = {}) {
|
|||
.then( html => parser.parseShow(html, opts) );
|
||||
}
|
||||
|
||||
/**
 * Fetches the page at utils.getDailyUrl() and passes its HTML to
 * parser.parseArticleCategories().
 *
 * @returns {Promise<*>} resolves with whatever parser.parseArticleCategories() returns
 */
async function getArticleCategories() {
    const html = await _fetchPage(utils.getDailyUrl());
    return parser.parseArticleCategories(html);
}
|
||||
|
||||
/**
 * Fetches a Bandcamp Daily article listing page and parses it.
 *
 * @param {Object} [params]
 * @param {string} [params.categoryUrl] - listing URL; when null/undefined the
 *     'latest' URL built from utils.getUrl('latest', utils.getDailyUrl()) is used
 * @param {number} [params.page] - page number, forwarded to utils.getDailyUrl()
 * @param {Object} [options]
 * @param {*} [options.imageFormat] - forwarded to _parseImageFormatArg()
 * @returns {Promise<*>} resolves with whatever parser.parseArticleList() returns
 */
async function getArticleList(params = {}, options = {}) {
    // Work on a copy: the default categoryUrl must not leak into the caller's object
    const listParams = Object.assign({}, params);
    if (listParams.categoryUrl == null) {
        listParams.categoryUrl = utils.getUrl('latest', utils.getDailyUrl());
    }
    const opts = {
        imageFormat: await _parseImageFormatArg(options.imageFormat)
    };
    const html = await _fetchPage(utils.getDailyUrl(listParams));
    return parser.parseArticleList(html, opts);
}
|
||||
|
||||
/**
 * Fetches a single article page and parses it with parser.parseArticle().
 *
 * @param {string} articleUrl - URL of the article page to fetch
 * @param {Object} [options]
 * @param {*} [options.albumImageFormat] - forwarded to _parseImageFormatArg() together with 9
 * @param {*} [options.artistImageFormat] - forwarded to _parseImageFormatArg() together with 21
 * @param {boolean} [options.includeRawData] - coerced to a boolean and passed to the parser
 * @returns {Promise<*>} resolves with whatever parser.parseArticle() returns
 */
async function getArticle(articleUrl, options = {}) {
    const { baseUrl } = await _getImageConstants();
    // NOTE(review): 9 and 21 are presumably the default image format ids — confirm in _parseImageFormatArg
    const albumImageFormat = await _parseImageFormatArg(options.albumImageFormat, 9);
    const artistImageFormat = await _parseImageFormatArg(options.artistImageFormat, 21);
    const parseOpts = {
        imageBaseUrl: baseUrl,
        albumImageFormat,
        artistImageFormat,
        includeRawData: Boolean(options.includeRawData)
    };
    const html = await _fetchPage(articleUrl);
    return parser.parseArticle(html, parseOpts);
}
|
||||
|
||||
async function _fetchPage(url, json = false) {
|
||||
return _cache.getOrSet('page', url + (json ? ':json' : ':html'), () => {
|
||||
return fetch(url).then( res => json ? res.json() : res.text() );
|
||||
|
@ -291,4 +319,7 @@ module.exports = {
|
|||
cache,
|
||||
getAllShows,
|
||||
getShow,
|
||||
getArticleCategories,
|
||||
getArticleList,
|
||||
getArticle
|
||||
};
|
300
lib/parser.js
300
lib/parser.js
|
@ -1,7 +1,7 @@
|
|||
const cheerio = require('cheerio');
|
||||
const {decode} = require('html-entities');
|
||||
const utils = require('./utils.js');
|
||||
const {URL} = require('url');
|
||||
const {EOL} = require('os');
|
||||
|
||||
// https://github.com/masterT/bandcamp-scraper/blob/master/lib/htmlParser.js
|
||||
function assignProps(objFrom, objTo, propNames) {
|
||||
|
@ -702,6 +702,301 @@ function parseShow(html, opts) {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
 * Parses the category tree found under '#daily-view-all'.
 *
 * Each <section> becomes { name, title, sections?, categories? } where
 *  - name is the section's class attribute,
 *  - title is the text of an immediately preceding <h2> (or ''),
 *  - sections holds parsed child <section> elements,
 *  - categories holds { url, name } for every link inside child <div> elements.
 * Empty 'sections' / 'categories' keys are omitted, and a section with neither
 * is dropped entirely.
 *
 * @param {string} html - page HTML
 * @returns {Object[]} parsed top-level sections
 */
function parseArticleCategories(html) {
    const $ = cheerio.load(html);
    const dailyUrl = utils.getDailyUrl();

    // Builds a category entry from a link, resolving relative hrefs against dailyUrl
    const toCategory = (link) => {
        const href = link.attr('href');
        return {
            url: utils.isAbsoluteUrl(href) ? href : utils.getUrl(href, dailyUrl),
            name: link.text()
        };
    };

    // Recursively parses one <section>; returns null when it yields nothing
    const parseSection = (section) => {
        const heading = section.prev('h2');
        const subsections = [];
        const categories = [];

        section.children().each( (index, child) => {
            const node = $(child);
            switch (child.tagName) {
                case 'section': {
                    const nested = parseSection(node);
                    if (nested !== null) {
                        subsections.push(nested);
                    }
                    break;
                }
                case 'div':
                    node.find('a').each( (linkIndex, link) => {
                        categories.push(toCategory($(link)));
                    });
                    break;
                default:
                    break;
            }
        });

        if (subsections.length === 0 && categories.length === 0) {
            return null;
        }

        const parsed = {
            name: section.attr('class'),
            title: heading.length ? heading.text() : ''
        };
        if (subsections.length > 0) {
            parsed.sections = subsections;
        }
        if (categories.length > 0) {
            parsed.categories = categories;
        }
        return parsed;
    };

    return $('#daily-view-all')
        .children('section')
        .toArray()
        .map( section => parseSection($(section)) )
        .filter( parsed => parsed !== null );
}
|
||||
|
||||
/**
 * Parses an article listing page.
 *
 * @param {string} html - page HTML
 * @param {Object} [opts]
 * @param {*} [opts.imageFormat] - forwarded to utils.reformatImageUrl()
 * @returns {{articles: Object[], total: number, start: number, end: number}}
 *     articles: { url, title, date, imageUrl, category: { url, name } } per entry;
 *     total/start/end come from the "<start> to <end> of <total>" text in
 *     '#num-results' and stay 0 when that text is missing or unrecognised.
 */
function parseArticleList(html, opts = {}) {
    const $ = cheerio.load(html);
    const dailyUrl = utils.getDailyUrl();
    const results = {
        articles: [],
        total: 0,
        start: 0,
        end: 0
    };

    // Resolves relative hrefs against the Daily base URL
    const resolveUrl = url => utils.isAbsoluteUrl(url) ? url : utils.getUrl(url, dailyUrl);

    $('articles-list').each( (i, list) => {
        $('.list-article', $(list)).each( (j, element) => {
            const article = $(element);

            // title and url — a cheerio selection is always truthy, so test its
            // length to skip entries that have no title link
            const titleLink = article.find('a.title');
            if (titleLink.length === 0) {
                return;
            }
            const title = titleLink.text();
            const url = resolveUrl(titleLink.attr('href'));

            const imageUrl = article.find('img').attr('src') || null;

            // category
            const infoText = article.find('.article-info-text');
            const infoTextCategoryLink = infoText.find('a.franchise');
            const infoTextMiddot = infoText.find('.middot');
            const categoryHref = infoTextCategoryLink.attr('href');
            const category = {
                // keep null when the entry has no category link rather than resolving null
                url: categoryHref ? resolveUrl(categoryHref) : null,
                name: infoTextCategoryLink.text() || ''
            };

            // date: what is left of the info text once the category link and
            // the separator have been removed
            infoTextCategoryLink.remove();
            infoTextMiddot.remove();
            const date = utils.stripLineBreaks(infoText.text()).trim();

            results.articles.push({
                url,
                title,
                date,
                imageUrl: utils.reformatImageUrl(imageUrl, opts.imageFormat),
                category
            });
        });
    });

    const resultsText = utils.stripLineBreaks($('#num-results').text()).trim();
    const rtm = resultsText.match(/(\d+)(?:\s*to\s*)(\d+)(?:\s*of\s*)(\d+)/);
    // match() returns null when the text is absent or has another shape
    if (rtm !== null) {
        results.total = parseInt(rtm[3], 10);
        results.start = parseInt(rtm[1], 10);
        results.end = parseInt(rtm[2], 10);
    }
    return results;
}
|
||||
|
||||
/**
 * Parses a single article page.
 *
 * Basic metadata is read from the page's 'application/ld+json' script; the
 * featured albums/tracks come from the JSON stored in the
 * 'data-player-infos' attribute of '#p-daily-article'; the body text is split
 * into an optional intro section followed by one section per <bamplayer-art>
 * element.
 *
 * @param {string} html - page HTML
 * @param {Object} opts
 * @param {string} opts.imageBaseUrl - prefix for generated image URLs
 * @param {{id: *}} opts.albumImageFormat - its id is used in album image URLs
 * @param {{id: *}} opts.artistImageFormat - its id is used in artist image URLs
 * @param {boolean} [opts.includeRawData] - when truthy, adds a 'raw' property
 *     holding the parsed ld+json, the player infos and the article body HTML
 * @returns {Object} the parsed article
 * @throws {SyntaxError} if the ld+json or player-infos JSON is malformed
 * @throws {TypeError} if the page has no ld+json metadata
 */
function parseArticle(html, opts) {
    const $ = cheerio.load(html);
    const basic = JSON.parse($('script[type="application/ld+json"]').html());
    // Pages without the attribute simply have no media items
    const playerInfos = $('#p-daily-article').attr('data-player-infos');
    const players = playerInfos ? JSON.parse(decode(playerInfos)) : null;
    const author = basic.author || {};

    // Resolves relative hrefs against the Daily base URL
    const toAbsoluteUrl = url => utils.isAbsoluteUrl(url) ? url : utils.getUrl(url, utils.getDailyUrl());

    const article = {
        title: basic.headline,
        description: basic.description,
        url: basic['@id'],
        imageUrl: basic.image,
        date: basic.datePublished,
        category: {
            name: basic.articleSection,
            url: null
        },
        genre: null,
        author: {
            name: author.name,
            url: author['@id']
        },
        mediaItems: [],
        sections: []
    };

    // get genre
    const genreLink = $('.genre a');
    if (genreLink.length > 0) {
        article.genre = {
            name: genreLink.text(),
            url: genreLink.attr('href')
        };

        const genreReadMoreLink = $('.moreingenre a');
        if (genreReadMoreLink.length > 0) {
            article.genre.readMoreUrl = toAbsoluteUrl(genreReadMoreLink.attr('href'));
        }
    }

    // get category url
    const categoryLink = $('article-type a');
    if (categoryLink.length > 0) {
        article.category.url = toAbsoluteUrl(categoryLink.attr('href'));
    }

    // get media items (albums and tracks featured in article)
    if (Array.isArray(players)) {
        players.forEach( player => {
            const mediaItem = {
                type: 'unknown',
                name: player.title,
                url: player.tralbum_url,
                imageUrl: '',
                featuredTrackPosition: player.featured_track,
                artist: {
                    name: player.band_name,
                    url: player.band_url,
                    imageUrl: '',
                    location: player.band_location
                },
                tracks: [],
                mediaItemRef: player.player_id
            };
            if (player.parent_tralbum_type === 'a') {
                mediaItem.type = 'album';
            }
            else if (player.parent_tralbum_type === 't') {
                mediaItem.type = 'track';
            }
            if (player.art_id) {
                mediaItem.imageUrl = `${opts.imageBaseUrl}/img/a${player.art_id}_${opts.albumImageFormat.id}.jpg`;
            }
            if (player.band_image_id) {
                mediaItem.artist.imageUrl = `${opts.imageBaseUrl}/img/${player.band_image_id}_${opts.artistImageFormat.id}.jpg`;
            }
            if (Array.isArray(player.tracklist)) {
                player.tracklist.forEach( trackInfo => {
                    mediaItem.tracks.push({
                        position: trackInfo.track_number,
                        name: trackInfo.track_title,
                        duration: trackInfo.audio_track_duration,
                        // a track without audio_url must not abort the whole parse
                        streamUrl: trackInfo.audio_url ? trackInfo.audio_url['mp3-128'] : null
                    });
                });
            }

            article.mediaItems.push(mediaItem);
        });
    }

    // Concatenates paragraphs: html joined by EOL, text joined by a blank line
    const _joinParagraphs = paragraphs => {
        const joined = {
            html: '',
            text: ''
        };
        paragraphs.each( (i, p) => {
            p = $(p);
            joined.html += (joined.html !== '' ? EOL : '') + p.html();
            joined.text += (joined.text !== '' ? EOL + EOL : '') + p.text();
        });
        return joined;
    };

    // Function that returns a section corresponding to a media item
    const _getSectionByPlayer = player => {
        const section = {
            heading: null,
            html: '',
            text: '',
            mediaItemRef: null
        };

        // Get heading: prevUntil() yields the closest sibling first, so first()
        // is the heading nearest to this player
        const heading = player.prevUntil('bamplayer-art', 'h3, h2').first();
        if (heading.length > 0) {
            section.heading = {
                html: heading.html(),
                text: utils.stripTags(utils.brToNewLine(heading.html())).trim()
            };
        }

        // Get html and text
        const body = _joinParagraphs(player.nextUntil('bamplayer-art, h3, h5, article-end', 'p'));
        section.html = body.html;
        section.text = body.text;

        // get mediaItemRef; stays null when the attribute is missing or has another shape
        const playerIdMatch = (player.attr('data-bind') || '').match(/playerMap\["(.+?)"]/);
        section.mediaItemRef = playerIdMatch ? (playerIdMatch[1] || null) : null;

        return section;
    };

    // Function that returns the introductory paragraph(s) of the article
    const _getIntroSection = articleBody => {
        const firstPlayer = articleBody.find('bamplayer-art').first();
        // Take the <p> siblings that precede the first player in document
        // order; prevAll('p') would list them closest-first, i.e. reversed.
        const paragraphs = firstPlayer.length > 0
            ? firstPlayer.parent().children().slice(0, firstPlayer.index()).filter('p')
            : articleBody.find('p');
        if (paragraphs.length > 0) {
            return _joinParagraphs(paragraphs);
        }
        return null;
    };

    // sections
    const articleBody = $('#p-daily-article article');
    const introSection = _getIntroSection(articleBody);
    if (introSection) {
        article.sections.push(introSection);
    }
    articleBody.find('bamplayer-art').each( (i, player) => {
        article.sections.push(_getSectionByPlayer($(player)));
    });

    if (opts.includeRawData) {
        article.raw = {
            basic,
            mediaItems: players,
            body: articleBody.html()
        };
    }

    return article;
}
|
||||
|
||||
module.exports = {
|
||||
parseDiscoverResults,
|
||||
parseDiscoverOptions,
|
||||
|
@ -716,4 +1011,7 @@ module.exports = {
|
|||
parseTags,
|
||||
parseAllShows,
|
||||
parseShow,
|
||||
parseArticleCategories,
|
||||
parseArticleList,
|
||||
parseArticle
|
||||
};
|
11
lib/utils.js
11
lib/utils.js
|
@ -132,6 +132,14 @@ function getShowUrl(showId) {
|
|||
return getSiteUrl() + '/?show=' + showId;
|
||||
}
|
||||
|
||||
/**
 * Builds a Bandcamp Daily URL.
 *
 * @param {Object} [params]
 * @param {string} [params.categoryUrl] - base URL to use instead of
 *     'https://daily.bandcamp.com'
 * @param {number|string} [params.page] - when truthy, added as a 'page' query parameter
 * @returns {string} the resulting URL
 */
function getDailyUrl(params = {}) {
    let url = params.categoryUrl || 'https://daily.bandcamp.com';
    if (params.page) {
        // Extend an existing query string instead of starting a second one
        const separator = url.includes('?') ? '&' : '?';
        url += separator + 'page=' + encodeURIComponent(params.page);
    }
    return url;
}
|
||||
|
||||
module.exports = {
|
||||
getUrl,
|
||||
getSiteUrl,
|
||||
|
@ -148,5 +156,6 @@ module.exports = {
|
|||
isAbsoluteUrl,
|
||||
getAllShowsUrl,
|
||||
getShowIdFromUrl,
|
||||
getShowUrl
|
||||
getShowUrl,
|
||||
getDailyUrl
|
||||
};
|
Loading…
Reference in New Issue
Block a user