'use strict';
import cheerio from 'cheerio';
import {UseMetadata, Redirect} from './results';
import RuntimeError, {UnhandledURLError, TimeoutError, NetworkError, NotFoundError, BadRequestError, UnauthorizedError, ForbiddenError, ServerError} from './errors/runtime';
import DocumentBuilder from './builder';
const REFRESH_TARGET = /\d+;\s*url=(.+)$/i;
/**
* CacheInterface is used by {@link Resolver} to read and write from a
* generic cache
*/
export class CacheInterface {
/**
* Get an item from the cache.
*
* @param {String} key The item key to read this item from.
* @param {CacheOptions} [options] Options for the cache when getting
* this result.
* @returns {CacheResult} The cached item. Can be `null`.
*/
get(key, options) {
throw new Error('Not Implemented');
}
/**
* Save an item to the cache.
*
* The return value of this method is never checked.
*
* @param {String} key The item key to save this item at.
* @param {Object} value The actual item to save to the cache.
* @param {CacheOptions} [options] Options for the cache.
*/
set(key, value, options) {
throw new Error('Not Implemented');
}
}
/**
* CacheResult
*
* @typedef {Object} CacheResult
* @property {Boolean} hit Whether or not the item was found in the cache
* @property {Object} value The item from the cache. May be `null`.
*/
/**
* CacheOptions
*
* @typedef {Object} CacheOptions
* @property {Number} ttl The number of seconds this item should remain cached.
*/
/**
* Request Context
*
* @typedef {Object} RequestContext
* @property {String|URL} url Automatic. The current URL of the request. May change
* after {@link Resolver#transformURL}.
* @property {URL} original_url Automatic. The original URL of the request.
* @property {FetchResponse} request Automatic. The `fetch` request object
* for this request. Not available within {@link Resolver#transformURL}.
*
* @property {URL} referrer The referrer of the request. May be changed
* within {@link Resolver#transformURL} to change the `Referer` header
* sent with the request.
* @property {Boolean} [skip_request] If set to true, no fetch request
* will be performed and instead {@link Resolver#processBody} will be
* called immediately with only null values and this context object.
* @property {Function} [fetch] The version of fetch to use. Useful in case
* you want to wrap fetch with something to, for example, automatically
* renew a client credential token when necessary and add it to the
* request. The custom fetch implementation should wrap our custom
* {@link fetch} with timeout support or provide an equivilent API.
* @property {Object} [headers] An optional object of headers to send
* with the request. Has no effect if set outside {@link Resolver#transformURL}.
* @property {Object} [options] An optional object of options to send to
* fetch when performing the request. Has no effect is set outside {@link Resolver#transformURL}.
* @property {Number} [timeout] The number of miliseconds after which the
* fetch request should time out. Overrides the default value from the
* {@link LinkService} options. Has no effect if set outside {@link Resolver#transformURL}.
* @property {Boolean} [follow_redirect=true] Whether or not the
* {@link Resolver} should automatically follow redirects when fetching
* the resource. Supports `Location` and `Refresh` header-based redirects.
* @property {String} [cache_key] The key to use when reading and writing
* to the configured cache. If this is not set, the request URL will be
* used instead. This should be used when more than one URL may describe
* the same resource. If explicitly set to `false`, caching will be
* disabled. This should probably never be done.
* @property {CacheOptions} [cache_opts] An optional set of extra
* options to be passed to cache methods. This can be used to override
* default timeouts, etc. depending on your {@link CacheInterface}.
* @property {String} [parse] Override how the response body should be
* parsed before {@link Resolver#processBody} is called. This can be
* set in either {@link Resolver#transformURL} or {@link Resolver#processHeaders}.
* Valid values are: `buffer`, `json`, `html`, and `xml`.
* @property {Object} [response] The data to return as a response from
* the {@link Resolver} for this request. If {@link Resolver#processBody}
* returns `null` or `undefined`, this value is returned instead. This
* can also be used to return data without ever processing the response
* body by setting it and then returning a falsey value from
* {@link Resolver#processHeaders}.
*/
/**
* Example URL
*
* @typedef {Object} ExampleURL
* @property {String|URL} url The example URL that we're describing.
* @property {String} [title] A descriptive name for the URL
* @property {String} [resolver] The name of the resolver. This is added
* to the ExampleURL automatically in {@link LinkService#getExamples}.
*/
/**
* Resolvers make requests, parse responses, and format data.
*
* @param {LinkService} service The service this Resolver is registered to.
*/
class Resolver {
constructor(service) {
this.service = service;
this.sort = this.constructor.sort ?? 0;
this.hosts = this.constructor.hosts;
this.examples = this.constructor.examples;
if ( this.fetch )
this.fetch = this.service.wrapFetch(this.fetch.bind(this));
else
this.fetch = this.service.fetch;
}
/**
* Get an array of example URLs that this Resolver can handle.
* Used for populating a selection field in testing clients.
*
* The default implementation checks if the Resolver class has
* a static array called `examples` and, if so, returns that.
*
* It is not necessary to provide examples, but examples do
* make testing easier.
*
* @example
* class MyResolver extends Resolver { };
* MyResolver.examples = [
* {title: 'Some Page', url: 'https://example.com/'}
* ];
*
* @returns {ExampleURL[]|String[]|URL[]} List of URLs.
*/
getExamples() {
return this.examples ?? null;
}
/**
* Determine whether or not this Resolver can handle a request
* for a given domain.
*
* The default implementation checks if the Resolver class has
* a static array called `hosts` and, if so, checks to see if
* the host is in that list.
*
* If you're not using the `hosts` array, you must override
* the method. Otherwise, it will throw an error.
*
* @example
* class MyResolver extends Resolver { };
* MyResolver.hosts = ['example.org'];
*
* const inst = new MyResolver(link_service);
*
* inst.handles('google.com'); // === false
* inst.handles('example.org'); // === true
* inst.handles('test.example.org'); // === true
*
* @param {String} host The domain to check.
* @returns {Boolean} Whether or not this Resolver can handle requests for that domain.
*/
handles(host) {
if ( this.hosts ) {
if ( this.hosts.includes(host) )
return true;
let i = this.hosts.length;
while (i--) {
if ( host.endsWith(this.hosts[i]) )
return true;
}
return false;
}
throw new Error('Not Implemented');
}
/**
* Create a new {@link DocumentBuilder} instance. Purely a convenience method.
* @returns {DocumentBuilder} New instance.
*/
builder() {
return new DocumentBuilder();
}
/**
* Create a URL for passing an image through a proxy, used to
* avoid leaking end-user IP addresses and to perform sanity
* checks on the contents of the image.
*
* This just calls {@link LinkService#proxyImage} as a convenience
* method.
*
* @param {String|URL} url The URL to proxy.
* @param {Number} [size=324] The size parameter to pass to the proxy server.
* @returns {String} The proxied image URL, or `null` if no proxy server is configured.
*/
proxyImage(...args) {
return this.service.proxyImage(...args);
}
/**
* The first method called while a Resolver works. This method is used for
* further processing a URL and determining what resource we actually want
* to fetch from the remove host, if any.
*
* Here, we can process a URL and, rather than requesting the normal
* webpage, redirect the request to the site's API. If we determine that we
* can't actually handle a specific URL, we can also fall back to the
* metadata provider here or outright redirect to another URL.
*
* We can also set `cache_key` on the ctx object to improve the cache hit
* rate when multiple URLs can describe the same resource.
*
* @example
* transformURL(url, ctx) {
* if ( ! url.pathname.startsWith('/video/') )
* return UseMetadata;
*
* const video_id = url.pathname.slice(7);
* ctx.cache_key = `my-service--${video_id}`;
* return `https://api.service.example/v2/video?id=${video_id}`;
* }
*
* @param {URL} url The URL we're processing.
* @param {RequestContext} ctx A context object that will be maintained
* while processing this URL to keep track of extra data.
* @returns {String|URL|UseMetadata|Redirect} If a String or URL are
* returned, they will be requested. If {@link UseMetadata} or
* {@link Redirect} are returned, the Resolver will pass control back
* to its {@link LinkService}.
*/
transformURL(url, ctx) { // eslint-disable-line no-unused-vars
return url;
}
/**
* The second method called while a Resolver works. This method is used
* for determining what to do once we've received response headers.
*
* The default implementation of this method just returns `request.ok`
* to request response body handling if the request is okay. (Meaning:
* the status code was in the range of 200-299.)
*
* In some cases, we'll receive all the information we need in just the
* response headers. In those cases, we can return a falsey value from
* this method to avoid parsing the response body at all.
*
* If we need to redirect, or we determine that the metadata resolver
* would have better results, we can also fall back to those behaviors.
*
* > **Note:** If `ctx.follow_redirects` has not been set to false,
* > the {@link Resolver} instance will automatically handle `Location`
* > and `Refresh` headers. The following example is only an example
* > and does not need to be replicated in your own functions.
*
* @example
* processHeaders(request, ctx) {
* if ( ! request.ok )
* return false;
*
* if ( request.headers.has('Location') )
* return new Redirect(request.headers.get('Location'), ctx.url);
*
* return true;
* }
*
* @param {FetchResponse} request The result after waiting for our `fetch`
* request to resolve.
* @param {RequestContext} ctx A context object that will be maintained
* while processing this URL to keep track of extra data.
* @returns {Boolean|UseMetadata|Redirect} If {@link UseMetadata} or
* {@link Redirect} are returned, the Resolver will pass control back
* to its {@link LinkService}. Otherwise, the truthiness of the return
* value will be used to determine whether or not we should spend the
* time to handle the response body.
*/
processHeaders(request, ctx) { // eslint-disable-line no-unused-vars
return request.ok;
}
/**
* The final method called while a Resolver works. This method is used
* for handling the parsed response body. If this method returns a
* non-`null` value, that value will be the result emitted from the
* {@link LinkService}.
*
* Depending on the parsing mode, `body` will be one of several different
* objects. If the `mode` is `buffer`, then `body` will be a {@link Buffer}
* instance as returned from {@link node-fetch}. If `mode` is `json`, then
* `body` will be the parsed JSON object.
*
* If `mode` is `html` or `xml`, then `body` will be a
* [cheerio](https://www.npmjs.com/package/cheerio) instance.
*
* @example
* processBody(body, mode) {
* if ( ! body?.video || mode !== 'json' )
* return UseMetadata;
*
* return {
* v: 5,
* accent: '#f00',
* short: this.builder()
* .setTitle(body.video.title)
* .setSubtitle('Example Service')
* .setLogo(SERVICE_LOGO)
* .addImage(body.video.thumbnail)
* .addField(
* i18nToken('embed.example.length', 'Length'),
* formatToken('duration', body.video.length)
* )
* };
* }
*
* @param {Buffer|Object|cheerio} body The parsed response body. This can
* be one of several different objects, depending on the detected
* `Content-Type` of the response. If `ctx.parse` is set, the response
* body will be parsed in that manner rather than through content detection.
* @param {String} mode The mode used for parsing the response body. This
* will be one of: `buffer`, `json`, `html`, or `xml`
* @param {RequestContext} ctx A context object that will be maintained
* while processing this URL to keep track of extra data.
* @param {FetchResponse} request The result of our `fetch` request, in
* case it's still needed for some reason.
* @returns {Object|UseMetadata|Redirect} If an object is returned, that
* data will be used as the final response. If {@link UseMetadata} or
* {@link Redirect} are returned, the Resolver will pass control back
* to its {@link LinkService}.
*/
processBody(body, mode, ctx, request) { // eslint-disable-line no-unused-vars
throw new Error('Not Implemented');
}
async _run(url, referrer, cookies) {
const ctx = {
url,
original_url: url,
referrer,
cookies,
follow_redirects: true
};
// Step 1. URL Transformation
let request_url = await this.transformURL(url, ctx);
if ( ! request_url )
throw new UnhandledURLError(url);
else if ( request_url === UseMetadata || request_url instanceof UseMetadata || request_url instanceof Redirect )
return request_url;
// Step 2. Caching
if ( this.service.cache && ctx.cache_key !== false ) {
if ( ctx.cache_key == null )
ctx.cache_key = request_url.toString();
if ( ctx.cache_key ) {
const resp = await this.service.cache.get(ctx.cache_key, ctx.cache_opts);
if ( resp?.hit ) {
let value = resp.value;
const type = value?.__type;
if ( type === 'redirect' )
value = new Redirect(value.url, value.base, value.silent ?? false);
else if ( type === 'use-metadata' )
value = UseMetadata;
if ( typeof value === 'object' )
value.cache = 'hit';
return resp.value;
}
}
}
// Step 1 (Again). Finish transforming URL
if ( ! ctx.fetch && !(request_url instanceof URL) )
request_url = new URL(request_url);
// Step 3. The Request
let data;
if ( ! ctx.skip_request ) {
if ( ctx.referrer )
referrer = ctx.referrer;
ctx.url = request_url;
const ref_header = referrer ? referrer.toString() : this.service.opts.default_referrer;
let headers = {
'User-Agent': this.service.opts.user_agent,
'Sec-Fetch-Mode': 'navigate'
};
if ( ref_header )
headers.Referer = ref_header;
if ( ctx.headers )
headers = {...headers, ...ctx.headers};
// Let individual resolvers override fetch if they need to.
const req_fetch = ctx.fetch ?? this.fetch;
let options = {
headers,
redirect: 'manual',
size: 5000000,
timeout: ctx.timeout ?? this.service.opts.resolver_timeout
};
if ( ctx.options )
options = Object.assign(options, ctx.options);
const immediate = ctx.follow_redirects === 'immediate',
visited = immediate ? new Set() : null;
let request;
let status;
while(true) {
const req_str = request_url.toString();
// Infinite loop protection for immediate mode.
if ( immediate ) {
if ( visited.has(req_str) )
throw new RedirectLoopError();
visited.add(req_str);
}
try {
request = ctx.request = await req_fetch(req_str, options, ctx.cookies);
} catch (err) {
if ( err.type === 'aborted' )
throw new TimeoutError;
else
throw new NetworkError;
}
if ( ! request ) {
console.error(new RuntimeError('Missing Fetch Result', ctx.fetch), request_url);
throw new NetworkError();
}
status = request.status;
// Redirect Check: Support Location for 3xx, and Refresh for all response codes.
if ( ctx.follow_redirects ) {
let redirect = null;
if ( status >= 300 && status < 400 )
redirect = request.headers.get('Location');
else if ( request.headers.has('Refresh') ) {
const match = REFRESH_TARGET.exec(request.headers.get('Refresh'));
if ( match )
redirect = match[1];
}
if ( redirect ) {
request.abort();
// In immediate mode, keep going.
if ( immediate ) {
request_url = new URL(redirect, request_url);
continue;
} else {
// Otherwise, return a redirect response.
const out = new Redirect(redirect, ctx.url);
if ( this.service.cache && ctx.cache_key )
await this.service.cache.set(ctx.cache_key, out, ctx.cache_opts);
return out;
}
}
}
// If we got here, we're not redirecting.
break;
}
// Step 4. Process Headers
let wants_body = await this.processHeaders(request, ctx);
data = null;
// Step 5. Process Body
if ( wants_body === UseMetadata || wants_body instanceof UseMetadata || wants_body instanceof Redirect ) {
data = wants_body;
} else if ( wants_body ) {
// Process the body.
if ( wants_body === true )
wants_body = ctx.parse || true;
if ( wants_body === true ) {
const content_type = request.headers.get('content-type') || '';
if ( content_type.includes('application/json') )
wants_body = 'json';
else if ( content_type.includes('text/html') || content_type.includes('application/xhtml+xml') )
wants_body = 'html';
else if ( content_type.includes('xml') )
wants_body = 'xml';
}
let body, mode = null;
try {
if ( wants_body === 'buffer' ) {
body = await request.buffer();
mode = 'buffer';
} else if ( wants_body === 'json' ) {
body = await request.json();
mode = 'json';
} else if ( wants_body === 'html' ) {
const raw = await request.text();
body = cheerio.load(raw);
mode = 'html';
} else if ( wants_body === 'xml' ) {
const raw = await request.text();
body = cheerio.load(raw, {xmlMode: true});
mode = 'xml';
} else if ( wants_body === 'text' ) {
body = await request.text();
mode = 'text';
}
} catch (err) {
request.abort();
console.error(err);
return ctx.response;
}
data = await this.processBody(body, mode, ctx, request);
}
// Step 6. Finish up.
data = data ?? ctx.response;
if ( ! data ) {
if ( status === 404 )
data = new NotFoundError();
else if ( status === 400 )
data = new BadRequestError();
else if ( status === 401 )
data = new UnauthorizedError();
else if ( status === 403 )
data = new ForbiddenError();
else if ( status >= 500 && status < 600 )
data = new ServerError();
}
// Make sure we're done. We should be.
request.abort();
} else {
// If we didn't want to make a request, just call
// processBody directly.
data = await this.processBody(null, null, ctx, null);
}
let out = data ?? ctx.response;
// Make sure we haven't done any oopses with our builders.
if ( out instanceof DocumentBuilder ) {
const obj = out.done().toJSON();
out = {v: 5};
if ( obj ) {
if ( ! Array.isArray(obj) ) {
if ( obj.type === 'header' )
out.short = obj;
else
out.full = obj;
} else {
out.full = obj;
if ( obj[0].type === 'header' )
out.short = obj[0];
}
} else
out = null;
} else {
if ( out?.short instanceof DocumentBuilder )
out.short = out.short.done();
if ( out?.mid instanceof DocumentBuilder )
out.mid = out.mid.done();
if ( out?.full instanceof DocumentBuilder )
out.full = out.full.done();
if ( out?.fragments ) {
for (const [key, val] of Object.entries(out.fragments))
if (val instanceof DocumentBuilder)
out.fragments[key] = val.done();
}
}
if ( this.service.cache && ctx.cache_key )
await this.service.cache.set(ctx.cache_key, out, ctx.cache_opts);
if ( this.service.cache && out )
out.cache = 'miss';
if ( ctx.cache_key === false && out )
out.cache = 'off';
return out;
}
}
export default Resolver;