diff --git a/examples/getArticle.js b/examples/getArticle.js
new file mode 100644
index 0000000..9aab022
--- /dev/null
+++ b/examples/getArticle.js
@@ -0,0 +1,12 @@
+const bcfetch = require('../');
+const util = require('util');
+
+const articleUrl = 'https://daily.bandcamp.com/best-ambient/best-new-ambient-march-2018';
+
+const options = {
+    includeRawData: false
+};
+
+bcfetch.getArticle(articleUrl, options).then( results => {
+    console.log(util.inspect(results, false, null, false));
+});
\ No newline at end of file
diff --git a/examples/getArticleCategories.js b/examples/getArticleCategories.js
new file mode 100644
index 0000000..18cc17c
--- /dev/null
+++ b/examples/getArticleCategories.js
@@ -0,0 +1,6 @@
+const bcfetch = require('../');
+const util = require('util');
+
+bcfetch.getArticleCategories().then( results => {
+    console.log(util.inspect(results, false, null, false));
+});
\ No newline at end of file
diff --git a/examples/getArticleList.js b/examples/getArticleList.js
new file mode 100644
index 0000000..e9a191a
--- /dev/null
+++ b/examples/getArticleList.js
@@ -0,0 +1,12 @@
+const bcfetch = require('../');
+const util = require('util');
+
+const params = {
+    categoryUrl: 'https://daily.bandcamp.com/best-ambient',
+    page: 2
+}
+
+bcfetch.getArticleList(params).then( results => {
+    console.log(util.inspect(results, false, null, false));
+    console.log('Number of articles fetched: ' + results.articles.length);
+});
\ No newline at end of file
diff --git a/lib/index.js b/lib/index.js
index 5583e6b..556e82a 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -259,6 +259,58 @@ async function getShow(showUrl, options = {}) {
         .then( html => parser.parseShow(html, opts) );
 }
 
+/**
+ * Fetches the page at utils.getDailyUrl() and parses its article categories.
+ * @returns {Promise<Array>} Result of parser.parseArticleCategories.
+ */
+async function getArticleCategories() {
+    return _fetchPage(utils.getDailyUrl())
+        .then( html => parser.parseArticleCategories(html) );
+}
+
+/**
+ * Fetches one page of a Bandcamp Daily article list and parses it.
+ * @param {Object} [params]
+ * @param {string} [params.categoryUrl] - List page to fetch; defaults to utils.getUrl('latest', utils.getDailyUrl()).
+ * @param {number} [params.page] - Page number, forwarded to utils.getDailyUrl.
+ * @param {Object} [options]
+ * @param {*} [options.imageFormat] - Passed to _parseImageFormatArg.
+ * @returns {Promise<Object>} Result of parser.parseArticleList.
+ */
+async function getArticleList(params = {}, options = {}) {
+    // Work on a copy so the caller's params object is never modified.
+    const query = Object.assign({}, params);
+    if (query.categoryUrl == null) {
+        query.categoryUrl = utils.getUrl('latest', utils.getDailyUrl());
+    }
+    const opts = {
+        imageFormat: await _parseImageFormatArg(options.imageFormat)
+    };
+    return _fetchPage(utils.getDailyUrl(query))
+        .then( html => parser.parseArticleList(html, opts) );
+}
+
+/**
+ * Fetches a Bandcamp Daily article page and parses it.
+ * @param {string} articleUrl - URL of the article page.
+ * @param {Object} [options]
+ * @param {*} [options.albumImageFormat] - Passed to _parseImageFormatArg together with 9.
+ * @param {*} [options.artistImageFormat] - Passed to _parseImageFormatArg together with 21.
+ * @param {boolean} [options.includeRawData] - When truthy, the parsed article carries a `raw` property.
+ * @returns {Promise<Object>} Result of parser.parseArticle.
+ */
+async function getArticle(articleUrl, options = {}) {
+    const imageConstants = await _getImageConstants();
+    const opts = {
+        imageBaseUrl: imageConstants.baseUrl,
+        albumImageFormat: await _parseImageFormatArg(options.albumImageFormat, 9),
+        artistImageFormat: await _parseImageFormatArg(options.artistImageFormat, 21),
+        includeRawData: Boolean(options.includeRawData)
+    };
+    return _fetchPage(articleUrl)
+        .then( html => parser.parseArticle(html, opts) );
+}
+
 async function _fetchPage(url, json = false) {
     return _cache.getOrSet('page', url + (json ? ':json' : ':html'), () => {
         return fetch(url).then( res => json ? res.json() : res.text() );
@@ -291,4 +343,7 @@ module.exports = {
     cache,
     getAllShows,
     getShow,
+    getArticleCategories,
+    getArticleList,
+    getArticle
 };
\ No newline at end of file
diff --git a/lib/parser.js b/lib/parser.js
index 98e1812..4743bc4 100644
--- a/lib/parser.js
+++ b/lib/parser.js
@@ -1,7 +1,7 @@
 const cheerio = require('cheerio');
 const {decode} = require('html-entities');
 const utils = require('./utils.js');
-const {URL} = require('url');
+const {EOL} = require('os');
 
 // https://github.com/masterT/bandcamp-scraper/blob/master/lib/htmlParser.js
 function assignProps(objFrom, objTo, propNames) {
@@ -702,6 +702,333 @@ function parseShow(html, opts) {
     return null;
 }
 
+/**
+ * Parses the Bandcamp Daily front page into a tree of category sections.
+ * @param {string} html - Page markup.
+ * @returns {Array<Object>} One entry per non-empty section element directly under
+ *   `#daily-view-all`; each has `name`, `title` and `sections` and/or `categories`.
+ */
+function parseArticleCategories(html) {
+    const $ = cheerio.load(html);
+    const dailyUrl = utils.getDailyUrl();
+    // Recursively converts a section element; returns null when it holds
+    // neither sub-sections nor category links.
+    const _parseSection = (section) => {
+        const h = section.prev('h2');
+        const title = h.length ? h.text() : '';
+        const s = {
+            name: section.attr('class'),
+            title,
+            sections: [],
+            categories: []
+        };
+        section.children().each( (i, child) => {
+            const tag = child.tagName;
+            const c = $(child);
+            if (tag === 'section') {
+                const parsed = _parseSection(c);
+                if (parsed !== null) {
+                    s.sections.push(parsed);
+                }
+            }
+            else if (tag === 'div') {
+                c.find('a').each( (j, link) => {
+                    const a = $(link);
+                    let url = a.attr('href');
+                    if (!utils.isAbsoluteUrl(url)) {
+                        url = utils.getUrl(url, dailyUrl);
+                    }
+                    s.categories.push({
+                        url,
+                        name: a.text()
+                    });
+                });
+            }
+        });
+        if (s.sections.length === 0) {
+            delete s.sections;
+        }
+        if (s.categories.length === 0) {
+            delete s.categories;
+        }
+        if (!s.sections && !s.categories) {
+            return null;
+        }
+        return s;
+    };
+
+    const results = [];
+    $('#daily-view-all').children('section').each( (i, section) => {
+        const parsed = _parseSection($(section));
+        if (parsed !== null) {
+            results.push(parsed);
+        }
+    });
+
+    return results;
+}
+
+/**
+ * Parses an article list page of Bandcamp Daily.
+ * @param {string} html - Page markup.
+ * @param {Object} opts
+ * @param {*} opts.imageFormat - Passed to utils.reformatImageUrl for each article image.
+ * @returns {{articles: Array<Object>, total: number, start: number, end: number}}
+ *   The counters stay 0 when the "X to Y of Z" summary text is not found.
+ */
+function parseArticleList(html, opts) {
+    const $ = cheerio.load(html);
+    const dailyUrl = utils.getDailyUrl();
+    const results = {
+        articles: [],
+        total: 0,
+        start: 0,
+        end: 0
+    };
+
+    $('articles-list').each( (i, list) => {
+        $('.list-article', $(list)).each( (j, item) => {
+            const article = $(item);
+            const imageUrl = article.find('img').attr('src') || null;
+            // category
+            const infoText = article.find('.article-info-text');
+            const infoTextCategoryLink = infoText.find('a.franchise');
+            const infoTextMiddot = infoText.find('.middot');
+            const category = {
+                url: infoTextCategoryLink.attr('href') || null,
+                name: infoTextCategoryLink.text() || ''
+            };
+            // Leave a missing category link as null instead of resolving it.
+            if (category.url !== null && !utils.isAbsoluteUrl(category.url)) {
+                category.url = utils.getUrl(category.url, dailyUrl);
+            }
+            // date: what remains of the info text once link and separator are removed
+            infoTextCategoryLink.remove();
+            infoTextMiddot.remove();
+            const date = utils.stripLineBreaks(infoText.text()).trim();
+            // title and url
+            const titleLink = article.find('a.title');
+            // A cheerio selection is always truthy, so test its length to
+            // skip entries that have no title link.
+            if (titleLink.length === 0) {
+                return;
+            }
+            const title = titleLink.text();
+            let url = titleLink.attr('href');
+            if (!utils.isAbsoluteUrl(url)) {
+                url = utils.getUrl(url, dailyUrl);
+            }
+
+            results.articles.push({
+                url,
+                title,
+                date,
+                imageUrl: utils.reformatImageUrl(imageUrl, opts.imageFormat),
+                category,
+            });
+        });
+    });
+
+    const resultsText = utils.stripLineBreaks($('#num-results').text()).trim();
+    // match() yields null when the summary text is absent or has another shape.
+    const rtm = resultsText.match(/(\d+)(?:\s*to\s*)(\d+)(?:\s*of\s*)(\d+)/);
+    if (rtm !== null) {
+        results.total = parseInt(rtm[3], 10);
+        results.start = parseInt(rtm[1], 10);
+        results.end = parseInt(rtm[2], 10);
+    }
+    return results;
+}
+
+/**
+ * Parses a Bandcamp Daily article page.
+ * @param {string} html - Page markup.
+ * @param {Object} opts
+ * @param {string} opts.imageBaseUrl - Prefix for generated image URLs.
+ * @param {Object} opts.albumImageFormat - Its `id` is used in album art URLs.
+ * @param {Object} opts.artistImageFormat - Its `id` is used in artist image URLs.
+ * @param {boolean} [opts.includeRawData] - When truthy, adds a `raw` property
+ *   holding the JSON-LD data, the player infos and the article body markup.
+ * @returns {Object} Article with `mediaItems` and `sections` arrays.
+ */
+function parseArticle(html, opts) {
+    const $ = cheerio.load(html);
+    const basic = JSON.parse($('script[type="application/ld+json"]').html());
+    const players = JSON.parse(decode($('#p-daily-article').attr('data-player-infos')));
+
+    const article = {
+        title: basic.headline,
+        description: basic.description,
+        url: basic['@id'],
+        imageUrl: basic.image,
+        date: basic.datePublished,
+        category: {
+            name: basic.articleSection,
+            url: null
+        },
+        genre: null,
+        author: {
+            name: basic.author.name,
+            url: basic.author['@id']
+        },
+        mediaItems: [],
+        sections: []
+    };
+
+    // get genre
+    const genreLink = $('.genre a');
+    if (genreLink.length > 0) {
+        article.genre = {
+            name: genreLink.text(),
+            url: genreLink.attr('href')
+        };
+
+        const genreReadMoreLink = $('.moreingenre a');
+        if (genreReadMoreLink.length > 0) {
+            article.genre.readMoreUrl = genreReadMoreLink.attr('href');
+            if (!utils.isAbsoluteUrl(article.genre.readMoreUrl)) {
+                article.genre.readMoreUrl = utils.getUrl(article.genre.readMoreUrl, utils.getDailyUrl());
+            }
+        }
+    }
+
+    // get category url
+    const categoryLink = $('article-type a');
+    if (categoryLink.length > 0) {
+        article.category.url = categoryLink.attr('href');
+        if (!utils.isAbsoluteUrl(article.category.url)) {
+            article.category.url = utils.getUrl(article.category.url, utils.getDailyUrl());
+        }
+    }
+
+    // get media items (albums and tracks featured in article)
+    if (Array.isArray(players)) {
+        players.forEach( player => {
+            const mediaItem = {
+                type: 'unknown',
+                name: player.title,
+                url: player.tralbum_url,
+                imageUrl: '',
+                featuredTrackPosition: player.featured_track,
+                artist: {
+                    name: player.band_name,
+                    url: player.band_url,
+                    imageUrl: '',
+                    location: player.band_location
+                },
+                tracks: [],
+                mediaItemRef: player.player_id
+            };
+            if (player.parent_tralbum_type === 'a') {
+                mediaItem.type = 'album';
+            }
+            else if (player.parent_tralbum_type === 't') {
+                mediaItem.type = 'track';
+            }
+            if (player.art_id) {
+                mediaItem.imageUrl = opts.imageBaseUrl + '/img/a' + player.art_id + '_' + opts.albumImageFormat.id + '.jpg';
+            }
+            if (player.band_image_id) {
+                mediaItem.artist.imageUrl = opts.imageBaseUrl + '/img/' + player.band_image_id + '_' + opts.artistImageFormat.id + '.jpg';
+            }
+            if (Array.isArray(player.tracklist)) {
+                player.tracklist.forEach( trackInfo => {
+                    const track = {
+                        position: trackInfo.track_number,
+                        name: trackInfo.track_title,
+                        duration: trackInfo.audio_track_duration,
+                        // Guard against entries without audio_url; report null then.
+                        streamUrl: trackInfo.audio_url ? trackInfo.audio_url['mp3-128'] : null
+                    };
+                    mediaItem.tracks.push(track);
+                });
+            }
+
+            article.mediaItems.push(mediaItem);
+        });
+    }
+
+    // Function that returns a section corresponding to a media item
+    const _getSectionByPlayer = player => {
+        const section = {
+            heading: null,
+            html: '',
+            text: '',
+            mediaItemRef: null
+        };
+
+        // Get heading
+        const heading = player.prevUntil('bamplayer-art', 'h3, h2').first();
+        if (heading.length > 0) {
+            section.heading = {
+                html: heading.html(),
+                text: utils.stripTags(utils.brToNewLine(heading.html())).trim()
+            };
+        }
+
+        // Get html and text
+        const paragraphs = player.nextUntil('bamplayer-art, h3, h5, article-end', 'p');
+        paragraphs.each( (i, p) => {
+            p = $(p);
+            section.html += (section.html !== '' ? EOL : '') + p.html();
+            section.text += (section.text !== '' ? EOL + EOL : '') + p.text();
+        });
+
+        // get mediaItemRef; stays null when data-bind is absent or does not
+        // reference playerMap
+        const playerIdMatch = (player.attr('data-bind') || '').match(/playerMap\["(.+?)"]/);
+        section.mediaItemRef = playerIdMatch ? playerIdMatch[1] : null;
+
+        return section;
+    };
+
+    // Function that returns the introductory paragraph(s) of the article
+    const _getIntroSection = articleBody => {
+        const firstPlayer = articleBody.find('bamplayer-art').first();
+        // prevAll() lists siblings closest-first, so reverse it to restore
+        // document order.
+        const paragraphs = firstPlayer.length > 0
+            ? firstPlayer.prevAll('p').get().reverse()
+            : articleBody.find('p').get();
+        if (paragraphs.length === 0) {
+            return null;
+        }
+        const section = {
+            html: '',
+            text: ''
+        };
+        paragraphs.forEach( el => {
+            const p = $(el);
+            section.html += (section.html !== '' ? EOL : '') + p.html();
+            section.text += (section.text !== '' ? EOL + EOL : '') + p.text();
+        });
+        return section;
+    };
+
+    // sections
+    const articleBody = $('#p-daily-article article');
+    const sections = [];
+    const introSection = _getIntroSection(articleBody);
+    if (introSection) {
+        sections.push(introSection);
+    }
+    const bcplayers = articleBody.find('bamplayer-art');
+    bcplayers.each( (i, player) => {
+        sections.push(_getSectionByPlayer($(player)));
+    });
+    article.sections = sections;
+
+    if (opts.includeRawData) {
+        article.raw = {
+            basic,
+            mediaItems: players,
+            body: articleBody.html()
+        };
+    }
+
+    return article;
+}
+
 module.exports = {
     parseDiscoverResults,
     parseDiscoverOptions,
@@ -716,4 +1043,7 @@ module.exports = {
     parseTags,
     parseAllShows,
     parseShow,
+    parseArticleCategories,
+    parseArticleList,
+    parseArticle
 };
\ No newline at end of file
diff --git a/lib/utils.js b/lib/utils.js
index 1b9e7ab..15b2ac8 100644
--- a/lib/utils.js
+++ b/lib/utils.js
@@ -132,6 +132,22 @@ function getShowUrl(showId) {
     return getSiteUrl() + '/?show=' + showId;
 }
 
+/**
+ * Builds a Bandcamp Daily URL.
+ * @param {Object} [params]
+ * @param {string} [params.categoryUrl] - Base URL; defaults to 'https://daily.bandcamp.com'.
+ * @param {number|string} [params.page] - When truthy, added as the `page` query parameter.
+ * @returns {string}
+ */
+function getDailyUrl(params = {}) {
+    let url = params.categoryUrl || 'https://daily.bandcamp.com';
+    if (params.page) {
+        // Extend an existing query string rather than starting a second one.
+        url += (url.includes('?') ? '&' : '?') + 'page=' + encodeURIComponent(params.page);
+    }
+    return url;
+}
+
 module.exports = {
     getUrl,
     getSiteUrl,
@@ -148,5 +164,6 @@ module.exports = {
     isAbsoluteUrl,
     getAllShowsUrl,
     getShowIdFromUrl,
-    getShowUrl
+    getShowUrl,
+    getDailyUrl
 };
\ No newline at end of file