Source: connectors/scraping-connector.js

import request from 'superagent'
import * as cj from 'cookiejar'

/**
 * Cookie-rewriting details:
 *
 * ScrapingConnector uses a cookie-jar (through superagent) to
 * handle sessions on traditional HTML backends. This lets
 * us make authenticated request client- *and* server-side,
 * for Desktop, etc.
 *
 * To maintain sessions when running server-side, ScrapingConnector
 * supports cookie re-writing. Eg:
 *
 *  1) Imagine you are running shop.com (the real backend) and
 *     upwa-shop.com (a server-side proxy using this Connector).
 *
 *  2) A browser requests a resource from the proxy, which
 *     forwards a request to shop.com. The shop.com response
 *     contains the header "set-cookie: session=1; domain=shop.com"
 *
 *  3) Before responding to the browser, this connector
 *     rewrites the cookie domain to proxy-shop.com, so that
 *     it will be included on the subsequent request from the
 *     browser to the proxy.
 */

/**
 * A base class for screen-scraping connectors.
 *
 * ScrapingConnector does three things:
 *
 *  1) Allows `window` to be injected, so we can swap it for JSDOM
 *     and run the Connector outside of a browser.
 *
 *  2) Supports cookie-based sessions server-side, using a
 *     cookie-jar and domain-name rewriting.
 *
 *  3) Lets us get/set default headers for requests.
 *
 */
export class ScrapingConnector {
    /**
     * @param {Window} window A browser-native window or JSDOM implementation.
     * @param {Object.<string, string>} cookieDomainRewrites the domains to rewrite
     *   on any cookies received from the backend, eg.`{"shop.com": "proxy-shop.com"}`
     */
    constructor({window, cookieDomainRewrites}) {
        this.window = window
        this.cookieDomainRewrites = cookieDomainRewrites || {}
        this.agent = request.agent()
        this.cookieJarSnapshot = null
    }

    /**
     * Return a Document constructed from a superagent HTTP response. Use
     * to parse content out of the DOM.
     *
     * @param {HTTPResponse} res a superagent http response, expected to be HTML.
     * @returns {Document}
     */
    buildDocument(res) {
        const doc = this.window.document.implementation.createHTMLDocument()
        doc.documentElement.innerHTML = res.text
        return doc
    }

    /**
     * @inheritDoc
     */
    getDefaultHeaders() {
        const headers = this.agent._defaults
            .filter((defaultCall) => defaultCall.fn === 'set')
            .map((header) => ({[header.arguments[0]]: header.arguments[1]}))
        return Object.assign({}, headers)
    }

    /**
     * @inheritDoc
     */
    setDefaultHeaders(headers) {
        Object.keys(headers).forEach((k) => {
            this.agent.set(k, headers[k])
        })
    }

    inBrowser() {
        return typeof window !== 'undefined'
    }

    /**
     * Get all *new or changed* cookies in the Connector's cookie jar,
     * since creation or the last call to `setCookies()`.
     *
     * Each returned cookie can be set directly as a header on an HTTP
     * response in the "set-cookie" header.
     *
     * This is only for use *outside* a browser. In the browser, cookies
     * are handled automatically.
     *
     * Example return value:
     *
     *    `["a=b; domain=example.com; path=/", "c=d; domain=example2.com; path=/"]`
     *
     * @returns {Array.<String>}
     */
    getCookies() {
        if (!this.inBrowser()) {
            const rewrites = this.cookieDomainRewrites || {}
            const rewriter = (cookie) => ScrapingConnector.rewriteCookieDomain(rewrites, cookie)
            const snapshot = this.cookieJarSnapshot || new cj.CookieJar()
            return ScrapingConnector.diffCookieJars(snapshot, this.agent.jar)
                .map(rewriter)
                .map((cookie) => cookie.toString())
        } else {
            return []
        }
    }

    /**
     * Set the initial cookies on the connector.
     *
     * The `cookies` parameter must be in the format for the `Cookie` header
     * taken directly from an HTTP request, eg: `"a=b; c=d"`
     *
     * @param {String} cookies
     */
    setCookies(cookies) {
        if (!this.inBrowser()) {
            cookies = cookies || ''
            cookies
                .split('; ')
                .filter((x) => x !== '')
                .forEach((c) => this.agent.jar.setCookies(c))
            this.cookieJarSnapshot = ScrapingConnector.cloneCookieJar(this.agent.jar)
        }
    }

    static cloneCookie(cookie) {
        return Object.assign(new cj.Cookie(), cookie)
    }

    static cloneCookieJar(jar) {
        const clone = new cj.CookieJar()
        jar.getCookies(cj.CookieAccessInfo.All).forEach((c) => {
            clone.setCookie(ScrapingConnector.cloneCookie(c))
        })
        return clone
    }

    /**
     * Returns the cookies that are new or changed in `newJar`.
     *
     * @param oldJar {CookieJar}
     * @param newJar {CookieJar}
     * @returns {Array<Cookie>}
     */
    static diffCookieJars(oldJar, newJar) {
        const oldCookies = oldJar.getCookies(cj.CookieAccessInfo.All).map((c) => c.toString())
        return newJar
            .getCookies(cj.CookieAccessInfo.All)
            .filter((c) => oldCookies.indexOf(c.toString()) < 0)
    }

    /**
     * Clone `cookie` replace `originalDomain` with `proxyDomain` if the
     * original cookie had an explicit domain set.
     *
     * This is for server-side usage, eg. when running the IM as a proxy
     * on a different domain than the original ecommerce backend.
     *
     * @param {Object<String, String>} rewrites
     * @param {Array<Cookie>} cookie
     * @returns {Array<Cookie>}
     */
    static rewriteCookieDomain(rewrites, cookie) {
        const clone = ScrapingConnector.cloneCookie(cookie)
        clone.domain = rewrites[clone.domain] || clone.domain
        return clone
    }
}