/* global jQuery:true */
/* exported jQuery */
// Dependencies: two project-local modules (./http, ./util) plus the
// third-party URI.js and jQuery packages.
var http = require('./http')
var util = require('./util')
var URI = require('urijs')
var $ = require('jquery')
// Make the jQuery module reachable under the name `jQuery` as well; the
// lint directives above declare that name as a writable, exported global.
// NOTE(review): presumably needed by code that expects a global `jQuery`
// -- confirm before removing.
jQuery = $
/**
 * Create a web scraper.
 *
 * @constructor
 * @param {string} uri - The URI of the first web page to scrape.
 * @param {integer} [count] - Number of pages to fetch. Any falsy value
 *   (omitted, `0`, `null`, ...) is stored as `0`.
 * @param {Object} [parser] - Parser that handles the generator string
 *   produced by scraping; `createQueue` calls its `parse` method.
 */
function WebScraper (uri, count, parser) {
  /**
   * Number of pages to fetch (0 when no usable count was given).
   */
  this.count = count || 0

  /**
   * Parser instance to handle the generator string. Stored exactly as
   * passed in, so it is `undefined` when the argument is omitted.
   */
  this.parser = parser

  /**
   * The URI of the first page to fetch.
   */
  this.uri = uri
}
/**
 * Scrape a web page.
 *
 * Looks at the domain of `uri` and hands the work over to the scraping
 * method written for that site; any other domain goes to the generic
 * `webpage` scraper. Every scraping method works the same way: it
 * takes the web page URI as input, fetches the page, and returns a
 * generator string as output (wrapped in a Promise). Schematically:
 *
 *     web page:                              generator string
 *     +-------------------+                  (Promise):
 *     | track1 by artist1 |    scraping
 *     +-------------------+    function      artist1 - track1
 *     | track2 by artist2 |    =======>      artist2 - track2
 *     +-------------------+                  artist3 - track3
 *     | track3 by artist3 |
 *     +-------------------+
 *
 * In the example above, the scraping function converts a table of
 * tracks to a generator string of the form `ARTIST - TRACK`. If the
 * input were an albums chart, then the output would be a string of
 * `#album` commands instead. In other words, the scraping function
 * should extract the *meaning* of the web page and express it as
 * input to the generator.
 *
 * @param {string} uri - The URI of the web page to scrape.
 * @param {integer} count - Number of pages to fetch. Passed on to the
 *   Last.fm, Pitchfork, Rate Your Music and Reddit scrapers; the
 *   YouTube and generic scrapers are called with the URI only.
 * @return {Promise<string>} A promise for the generator string.
 */
WebScraper.prototype.scrape = function (uri, count) {
  switch (URI(uri).domain()) {
    case 'last.fm':
      return this.lastfm(uri, count)
    case 'pitchfork.com':
      return this.pitchfork(uri, count)
    case 'rateyourmusic.com':
      return this.rateyourmusic(uri, count)
    case 'reddit.com':
      return this.reddit(uri, count)
    case 'youtube.com':
      return this.youtube(uri)
    default:
      return this.webpage(uri)
  }
}
/**
 * Create a queue of tracks.
 *
 * Feeds the scraped generator string to the parser given to the
 * constructor and dispatches the collection that the parser returns.
 *
 * @param {string} result - A newline-separated list of tracks.
 * @return {Promise | Queue} Whatever the parsed collection's
 *   `dispatch()` returns: a queue of results.
 */
WebScraper.prototype.createQueue = function (result) {
  return this.parser.parse(result).dispatch()
}
/**
 * Dispatch entry.
 *
 * Scrapes the page at `this.uri` (fetching `this.count` pages) and
 * turns the resulting generator string into a queue via `createQueue`.
 *
 * @return {Promise | Queue} A promise for the queue of results.
 */
WebScraper.prototype.dispatch = function () {
  var scraper = this
  var scraping = scraper.scrape(scraper.uri, scraper.count)

  function toQueue (generatorString) {
    return scraper.createQueue(generatorString)
  }

  return scraping.then(toQueue)
}
/**
 * Clean up a string's contents.
 *
 * Strips bracketed/parenthesized annotations and anything following
 * them on the same line, collapses runs of dashes and dots, removes
 * every character that is not a word character, whitespace, `-`, `'`
 * or `.`, and finally normalizes whitespace with `trim`.
 *
 * @param {string} [str] - The string to clean. `null`, `undefined` and
 *   other falsy values are treated as the empty string, like in `trim`.
 * @return {string} A new string.
 */
WebScraper.prototype.cleanup = function (str) {
  // Applied in order; every pattern is global, and `.` does not cross
  // line breaks, so the "rest of the line" rules work line by line.
  var substitutions = [
    [/].*/gi, ']'], // cut whatever follows the first `]` on a line
    [/\).*/gi, ')'], // cut whatever follows the first `)` on a line
    [/\[[^\]]*]/gi, ''], // drop `[...]` groups
    [/\([^)]*\)/gi, ''], // drop `(...)` groups
    [/-+/gi, '-'], // collapse runs of dashes
    [/\.+/gi, '.'], // collapse runs of dots
    // NOTE(review): `\w` is ASCII-only, so accented letters are removed
    // here, before trim() gets to call util.toAscii -- confirm that
    // this ordering is intended.
    [/[^-'.\w\s]/gi, '']
  ]

  // Falsy input becomes '' so that a missing value does not throw.
  var cleaned = substitutions.reduce(function (text, rule) {
    return text.replace(rule[0], rule[1])
  }, str || '')

  return this.trim(cleaned)
}
/**
 * Clean up a string's whitespace.
 *
 * Removes leading and trailing whitespace, replaces every run of
 * whitespace with a single space, and passes the result through
 * `util.toAscii` (presumably an ASCII conversion -- see ./util).
 *
 * @param {string} [str] - The string to normalize; falsy values are
 *   treated as the empty string.
 * @return {string} A new string.
 */
WebScraper.prototype.trim = function (str) {
  var collapsed = (str || '').trim().replace(/[\s]+/g, ' ')
  return util.toAscii(collapsed)
}
/**
 * Scrape a Last.fm tracklist.
 *
 * The kind of output is chosen from the start URI:
 *
 * - `/+tracks`: tracks by a single artist, emitted as `ARTIST - TRACK`
 * - `/+similar`: similar artists, emitted as `#top ARTIST`
 * - `/artists`: list of artists, emitted as `#top ARTIST`
 * - `/albums`: list of albums, emitted as `#album ALBUM`
 * - anything else: list of tracks by various artists, emitted as is
 *
 * Further pages are found through the `.pagination-next a` link.
 *
 * @param {string} uri - The URI of the web page to scrape.
 * @param {integer} [count] - The number of pages to scrape. Defaults
 *   to 1 when omitted or 0.
 * @return {Promise<string>} A newline-separated list of tracks.
 */
WebScraper.prototype.lastfm = function (uri, count) {
  var self = this

  // One output line per element matching `selector`:
  // the prefix followed by the element's trimmed text.
  function listing (html, selector, prefix) {
    var text = ''
    html.find(selector).each(function () {
      text += prefix + self.trim($(this).text()) + '\n'
    })
    return text
  }

  // Convert one fetched page into generator lines. The decision is
  // always based on the start URI, not on the URI of the current page.
  function extract (html) {
    if (uri.match(/\/\+tracks/gi)) {
      // tracks by a single artist: the artist name comes from the
      // library breadcrumb or, failing that, from the page title
      var header = html.find('header a.library-header-crumb')
      if (header.length === 0) {
        header = html.find('h1.header-title')
      }
      var artist = self.trim(header.first().text())
      return listing(html, 'td.chartlist-name', artist + ' - ')
    }
    if (uri.match(/\/\+similar/gi)) {
      // similar artists
      return listing(html, 'h3.big-artist-list-title', '#top ')
    }
    if (uri.match(/\/artists/gi)) {
      // list of artists
      return listing(html, 'td.chartlist-name', '#top ')
    }
    if (uri.match(/\/albums/gi)) {
      // list of albums
      return listing(html, 'td.chartlist-name', '#album ')
    }
    // list of tracks by various artists
    return listing(html, 'td.chartlist-name', '')
  }

  // Fetch one page, append its lines to `collected`, and continue with
  // the next page until `remaining` reaches 1 or there is no next link.
  function fetchPage (pageUri, collected, remaining) {
    var absoluteUri = URI(pageUri).absoluteTo(uri).toString()
    console.log(absoluteUri)
    return http(absoluteUri).then(function (data) {
      var html = $($.parseHTML(data))
      var lines = extract(html)
      console.log(lines.trim())
      var total = collected + lines
      if (remaining === 1) {
        return total
      }
      var next = html.find('.pagination-next a')
      if (next.length > 0) {
        return fetchPage(next.attr('href'), total, remaining - 1)
      }
      return total
    })
  }

  return fetchPage(uri, '', count || 1)
}
/**
 * Scrape a Pitchfork list.
 *
 * Every `div.artist-work` element becomes one `#album ARTIST - ALBUM`
 * line. Further pages are found by taking the element that follows
 * the active pagination item and reading the link inside it.
 *
 * @param {string} uri - The URI of the web page to scrape.
 * @param {integer} [count] - The number of pages to scrape. When
 *   omitted or 0, pages are followed until there is no next page.
 * @return {Promise<string>} A newline-separated list of albums.
 */
WebScraper.prototype.pitchfork = function (uri, count) {
  var self = this

  // Build the `#album` lines for one fetched page.
  function readAlbums (html) {
    var text = ''
    html.find('div.artist-work').each(function () {
      var entry = $(this)
      var artist = self.trim(entry.find('ul.artist-list li:first').text())
      var album = self.trim(entry.find('h2.work-title').text())
      text += '#album ' + artist + ' - ' + album + '\n'
    })
    return text
  }

  // Fetch one page, append its lines to `collected`, and continue with
  // the next page until `remaining` reaches 1 or there is no next page.
  function fetchPage (pageUri, collected, remaining) {
    var absoluteUri = URI(pageUri).absoluteTo(uri).toString()
    console.log(absoluteUri)
    return http(absoluteUri).then(function (data) {
      var html = $($.parseHTML(data))
      var albums = readAlbums(html)
      console.log(albums.trim())
      var total = collected + albums
      if (remaining === 1) {
        return total
      }
      var following = html.find('.fts-pagination__list-item--active').next()
      if (following.length > 0) {
        return fetchPage(following.find('a').attr('href'), total, remaining - 1)
      }
      return total
    })
  }

  return fetchPage(uri, '', count || 0)
}
/**
 * Scrape a Rate Your Music chart.
 *
 * Every `div.chart_details` element becomes one
 * `#album ARTIST - ALBUM` line. Further pages are found through the
 * `a.navlinknext` link.
 *
 * @param {string} uri - The URI of the web page to scrape.
 * @param {integer} [count] - The number of pages to scrape. When
 *   omitted or 0, pages are followed until there is no next link.
 * @return {Promise<string>} A newline-separated list of albums.
 */
WebScraper.prototype.rateyourmusic = function (uri, count) {
  var self = this

  // Build the `#album` lines for one fetched chart page.
  function readChart (html) {
    var text = ''
    html.find('div.chart_details').each(function () {
      var row = $(this)
      var artist = self.trim(row.find('a.artist').text())
      var album = self.trim(row.find('a.album').text())
      text += '#album ' + artist + ' - ' + album + '\n'
    })
    return text
  }

  // Fetch one page, append its lines to `collected`, and continue with
  // the next page until `remaining` reaches 1 or there is no next link.
  function fetchPage (pageUri, collected, remaining) {
    var absoluteUri = URI(pageUri).absoluteTo(uri).toString()
    console.log(absoluteUri)
    return http(absoluteUri).then(function (data) {
      var html = $($.parseHTML(data))
      var albums = readChart(html)
      console.log(albums.trim())
      var total = collected + albums
      if (remaining === 1) {
        return total
      }
      var next = html.find('a.navlinknext')
      if (next.length > 0) {
        return fetchPage(next.attr('href'), total, remaining - 1)
      }
      return total
    })
  }

  return fetchPage(uri, '', count || 0)
}
/**
 * Scrape a Reddit forum.
 *
 * Handles post listings and comment threads (start URIs containing
 * `/comments/`). Employs Bob Nisco's heuristic for parsing comments.
 * Further pages are found through the `.next-button a` link.
 *
 * @param {string} uri - The URI of the web page to scrape.
 * @param {integer} [count] - The number of pages to scrape. Defaults
 *   to 1 when omitted or 0.
 * @return {Promise<string>} A newline-separated list of tracks.
 */
WebScraper.prototype.reddit = function (uri, count) {
  var self = this
  count = count || 1

  // Turn a single comment (a jQuery-wrapped `div.md`) into zero or
  // more newline-terminated track lines. Returning the text, instead
  // of appending to a variable shared with the caller, keeps the page
  // accumulator from being shadowed by a local of the same name.
  function parseComment (comment) {
    // first assumption: if there are links,
    // they are probably links to songs
    var links = comment.find('a')
    if (links.length > 0) {
      var linked = ''
      links.each(function () {
        var txt = $(this).text()
        // skip links whose text is just a bare URL
        if (!txt.match(/https?:/gi)) {
          linked += self.cleanup(txt) + '\n'
        }
      })
      return linked
    }

    // second assumption: if there are multiple sentences,
    // the song is the first one
    var body = comment.text()
    var sentences = body.split('.')
    if (sentences.length > 1) {
      return self.cleanup(sentences[0]) + '\n'
    }

    // third assumption: if there are multiple lines to a comment,
    // then the song will be on the first line with a user's
    // comments on other lines after it
    var bodyLines = body.split('\n')
    if (bodyLines.length > 1) {
      return self.cleanup(bodyLines[0]) + '\n'
    }

    // fall-back case
    return self.cleanup(body) + '\n'
  }

  // Fetch one page, append its lines to `result`, and continue with
  // the next page until `count` reaches 1 or there is no next link.
  function getPages (nextUri, result, count) {
    nextUri = URI(nextUri).absoluteTo(uri).toString()
    console.log(nextUri)
    return http(nextUri).then(function (data) {
      var html = $($.parseHTML(data))
      var lines = ''
      if (uri.match(/\/comments\//gi)) {
        // comments thread
        html.find('div.entry div.md').each(function () {
          lines += parseComment($(this))
        })
      } else {
        // post listing
        html.find('a.title').each(function () {
          lines += self.cleanup($(this).text()) + '\n'
        })
      }
      console.log(lines.trim())
      result += lines
      if (count === 1) {
        return result
      }
      var next = html.find('.next-button a')
      if (next.length > 0) {
        return getPages(next.attr('href'), result, count - 1)
      }
      return result
    })
  }

  return getPages(uri, '', count)
}
/**
 * Scrape a web page.
 *
 * This is a fall-back function in case none of the other scraping
 * functions apply. It treats the cleaned-up text of every link on the
 * page as one track.
 *
 * @param {string} uri - The URI of the web page to scrape.
 * @return {Promise<string>} A newline-separated list of tracks, with
 *   surrounding whitespace removed.
 */
WebScraper.prototype.webpage = function (uri) {
  var scraper = this
  console.log(uri)

  // Convert the fetched markup into one line per `a` element.
  function linksToTracks (data) {
    var pieces = []
    $($.parseHTML(data)).find('a').each(function () {
      pieces.push(scraper.cleanup($(this).text()) + '\n')
    })
    var tracks = pieces.join('').trim()
    console.log(tracks)
    return tracks
  }

  return http(uri).then(linksToTracks)
}
/**
 * Scrape a YouTube playlist.
 *
 * Treats the cleaned-up text of every playlist entry title
 * (`div.playlist-video-description h4` or `a.pl-video-title-link`)
 * as one track. Only the given page is fetched.
 *
 * @param {string} uri - The URI of the web page to scrape.
 * @return {Promise<string>} A newline-separated list of tracks, with
 *   surrounding whitespace removed.
 */
WebScraper.prototype.youtube = function (uri) {
  var scraper = this
  var titleSelector = 'div.playlist-video-description h4, a.pl-video-title-link'
  console.log(uri)

  // Convert the fetched markup into one line per playlist title.
  function titlesToTracks (data) {
    var pieces = []
    $($.parseHTML(data)).find(titleSelector).each(function () {
      pieces.push(scraper.cleanup($(this).text()) + '\n')
    })
    var tracks = pieces.join('').trim()
    console.log(tracks)
    return tracks
  }

  return http(uri).then(titlesToTracks)
}
// The WebScraper constructor is this module's only export.
module.exports = WebScraper