"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.JSDOMCrawler = void 0;
exports.domCrawlerEnqueueLinks = domCrawlerEnqueueLinks;
exports.createJSDOMRouter = createJSDOMRouter;
const tslib_1 = require("tslib");
const http_1 = require("@crawlee/http");
const utils_1 = require("@crawlee/utils");
const cheerio = tslib_1.__importStar(require("cheerio"));
const jsdom_1 = require("jsdom");
const ow_1 = tslib_1.__importDefault(require("ow"));
const timeout_1 = require("@apify/timeout");
const utilities_1 = require("@apify/utilities");
/**
 * Provides a framework for the parallel crawling of web pages using plain HTTP requests and
 * [jsdom](https://www.npmjs.com/package/jsdom) DOM implementation.
 * The URLs to crawl are fed either from a static list of URLs
 * or from a dynamic queue of URLs enabling recursive crawling of websites.
 *
 * Since `JSDOMCrawler` uses raw HTTP requests to download web pages,
 * it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
 * to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
 * because they load the pages using a full-featured headless Chrome browser.
 *
 * Alternatively, you can use {@link JSDOMCrawlerOptions.runScripts} to run website scripts in Node.
 * JSDOM does not implement all the standards, so websites can break.
 *
 * **Limitation**:
 * This crawler does not support proxies and cookies yet (each page load starts with an empty cookie store), and the user agent is always set to `Chrome`.
 *
 * `JSDOMCrawler` downloads each URL using a plain HTTP request,
 * parses the HTML content using [JSDOM](https://www.npmjs.com/package/jsdom)
 * and then invokes the user-provided {@link JSDOMCrawlerOptions.requestHandler} to extract page data
 * using the `window` object.
 *
 * The source URLs are represented using {@link Request} objects that are fed from
 * {@link RequestList} or {@link RequestQueue} instances provided by the {@link JSDOMCrawlerOptions.requestList}
 * or {@link JSDOMCrawlerOptions.requestQueue} constructor options, respectively.
 *
 * If both {@link JSDOMCrawlerOptions.requestList} and {@link JSDOMCrawlerOptions.requestQueue} are used,
 * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
 * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
 *
 * The crawler finishes when there are no more {@link Request} objects to crawl.
 *
 * We can use the `preNavigationHooks` to adjust `gotOptions`:
 *
 * ```
 * preNavigationHooks: [
 *     (crawlingContext, gotOptions) => {
 *         // ...
 *     },
 * ]
 * ```
 *
 * By default, `JSDOMCrawler` only processes web pages with the `text/html`
 * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
 * and skips pages with other content types. If you want the crawler to process other content types,
 * use the {@link JSDOMCrawlerOptions.additionalMimeTypes} constructor option.
 * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
 * For more details, see {@link JSDOMCrawlerOptions.requestHandler}.
 *
 * New requests are only dispatched when there is enough free CPU and memory available,
 * using the functionality provided by the {@link AutoscaledPool} class.
 * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
 * parameter of the `JSDOMCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
 * {@link AutoscaledPool} options are available directly in the `JSDOMCrawler` constructor.
 *
 * **Example usage:**
 *
 * ```javascript
 * const crawler = new JSDOMCrawler({
 *     async requestHandler({ request, window }) {
 *         await Dataset.pushData({
 *             url: request.url,
 *             title: window.document.title,
 *         });
 *     },
 * });
 *
 * await crawler.run([
 *     'http://crawlee.dev',
 * ]);
 * ```
 * @category Crawlers
 */
// User agent sent by JSDOM when it fetches sub-resources (scripts, stylesheets, ...).
// Copy from /packages/browser-pool/src/abstract-classes/browser-plugin.ts:17
// in order not to include the entire package here
const JSDOM_RESOURCE_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36';
// Single resource loader instance handed to every JSDOM window created in this module.
const resources = new jsdom_1.ResourceLoader({ userAgent: JSDOM_RESOURCE_USER_AGENT });
/**
 * HTTP-based crawler that parses each downloaded response with JSDOM and hands the resulting
 * `window` (plus cheerio-based helpers) to the request handler.
 */
class JSDOMCrawler extends http_1.HttpCrawler {
    /**
     * @param options Crawler options. `runScripts` and `hideInternalConsole` (both default to `false`)
     *   are consumed here; all remaining options are forwarded to `HttpCrawler`.
     * @param config Forwarded to `HttpCrawler` unchanged.
     */
    constructor(options = {}, config) {
        const { runScripts = false, hideInternalConsole = false, ...httpOptions } = options;
        super(httpOptions, config);
        Object.defineProperty(this, "runScripts", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "hideInternalConsole", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Lazily created by `getVirtualConsole()` and then reused for every window.
        Object.defineProperty(this, "virtualConsole", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: null
        });
        // Listener attached to the virtual console's `jsdomError` event.
        Object.defineProperty(this, "jsdomErrorHandler", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: (error) => this.log.debug('JSDOM error from console', error)
        });
        this.runScripts = runScripts;
        this.hideInternalConsole = hideInternalConsole;
    }
    /**
     * Returns the currently used `VirtualConsole` instance. Can be used to listen for the JSDOM's internal console messages.
     *
     * If the `hideInternalConsole` option is set to `true`, the messages aren't logged to the console by default,
     * but the virtual console can still be listened to.
     *
     * The instance is created on first use and cached, so every call returns the same object.
     *
     * **Example usage:**
     * ```javascript
     * const console = crawler.getVirtualConsole();
     * console.on('error', (e) => {
     *     log.error(e);
     * });
     * ```
     */
    getVirtualConsole() {
        if (this.virtualConsole) {
            return this.virtualConsole;
        }
        this.virtualConsole = new jsdom_1.VirtualConsole();
        if (!this.hideInternalConsole) {
            this.virtualConsole.sendTo(console, { omitJSDOMErrors: true });
        }
        this.virtualConsole.on('jsdomError', this.jsdomErrorHandler);
        return this.virtualConsole;
    }
    /**
     * Releases the per-request JSDOM window.
     *
     * The virtual console is shared by all requests of this crawler and its `jsdomError` listener
     * is attached only once (when the console is created), so it is intentionally left in place:
     * detaching it here would silence JSDOM error logging for every later or still-running request.
     *
     * @param context Crawling context whose `window` (if any) is closed.
     */
    async _cleanupContext(context) {
        context.window?.close();
    }
    /**
     * Builds a JSDOM window from the HTTP response and returns the context extensions
     * (`window`, `body`, `document`, `enqueueLinks`) derived from it.
     *
     * @param response Response stream; it is fully buffered before parsing.
     * @param isXml When truthy the body is parsed as `text/xml`, otherwise as `text/html`.
     * @param crawlingContext Context of the current request; its `request` is used by `enqueueLinks`.
     */
    async _parseHTML(response, isXml, crawlingContext) {
        const body = await (0, utilities_1.concatStreamToBuffer)(response);
        const { window } = new jsdom_1.JSDOM(body, {
            url: response.url,
            contentType: isXml ? 'text/xml' : 'text/html',
            runScripts: this.runScripts ? 'dangerously' : undefined,
            resources,
            virtualConsole: this.getVirtualConsole(),
            pretendToBeVisual: true,
        });
        // add some stubs in place of missing API so processing won't fail
        Object.defineProperty(window, 'matchMedia', {
            writable: true,
            value: (query) => ({
                matches: false,
                media: query,
                onchange: null,
                addListener: () => { },
                removeListener: () => { },
                addEventListener: () => { },
                removeEventListener: () => { },
                dispatchEvent: () => { },
            }),
        });
        window.document.createRange = () => {
            const range = new window.Range();
            range.getBoundingClientRect = () => ({});
            range.getClientRects = () => ({ item: () => null, length: 0 });
            return range;
        };
        if (this.runScripts) {
            // Give page scripts a chance to finish: wait for the window `load` event, but never
            // longer than 10 seconds. A timeout is only logged, the page is still processed.
            try {
                await (0, timeout_1.addTimeoutToPromise)(async () => new Promise((resolve) => {
                    window.addEventListener('load', () => resolve(), { once: true });
                }), 10000, 'Window.load event not fired after 10 seconds.');
            }
            catch (e) {
                this.log.debug(e.message);
            }
        }
        return {
            window,
            // Getters so that the handler always sees the current (possibly script-mutated) DOM.
            get body() {
                return window.document.documentElement.outerHTML;
            },
            get document() {
                return window.document;
            },
            enqueueLinks: async (enqueueOptions) => {
                return domCrawlerEnqueueLinks({
                    options: { ...enqueueOptions, limit: this.calculateEnqueuedRequestLimit(enqueueOptions?.limit) },
                    window,
                    requestQueue: await this.getRequestQueue(),
                    robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
                    onSkippedRequest: this.handleSkippedRequest,
                    originalRequestUrl: crawlingContext.request.url,
                    finalRequestUrl: crawlingContext.request.loadedUrl,
                });
            },
        };
    }
    /**
     * Adds the `waitForSelector` and `parseWithCheerio` helpers to the context and then delegates
     * to the parent implementation.
     *
     * @param context Crawling context of the current request.
     */
    async _runRequestHandler(context) {
        const pollIntervalMs = 50;
        /**
         * Polls the serialized page (every 50 ms) until `selector` matches at least one element.
         * Throws `Selector '<selector>' not found.` once the remaining time reaches zero.
         */
        context.waitForSelector = async (selector, timeoutMs = 5000) => {
            let remainingMs = timeoutMs;
            for (;;) {
                const $ = cheerio.load(context.body);
                if ($(selector).get().length !== 0) {
                    return;
                }
                // A falsy budget (0, NaN, ...) means there is no time left to wait.
                if (!remainingMs) {
                    throw new Error(`Selector '${selector}' not found.`);
                }
                await (0, utils_1.sleep)(pollIntervalMs);
                remainingMs = Math.max(remainingMs - pollIntervalMs, 0);
            }
        };
        /**
         * Returns a cheerio handle for the current page. When `selector` is provided, first waits
         * up to `timeoutMs` for it to appear (same polling and error as `waitForSelector`).
         */
        context.parseWithCheerio = async (selector, timeoutMs = 5000) => {
            if (selector) {
                await context.waitForSelector(selector, timeoutMs);
            }
            return cheerio.load(context.body);
        };
        await super._runRequestHandler(context);
    }
}
exports.JSDOMCrawler = JSDOMCrawler;
// Validation shape for the constructor options: everything `HttpCrawler` accepts
// plus the two JSDOM-specific boolean flags.
const jsdomCrawlerOptionsShape = {
    ...http_1.HttpCrawler.optionsShape,
    runScripts: ow_1.default.optional.boolean,
    hideInternalConsole: ow_1.default.optional.boolean,
};
Object.defineProperty(JSDOMCrawler, "optionsShape", {
    value: jsdomCrawlerOptionsShape,
    writable: true,
    configurable: true,
    enumerable: true,
});
/**
 * Tells whether the given options object carries a (truthy) `enqueueLinks` member.
 * @internal
 */
function containsEnqueueLinks(options) {
    const { enqueueLinks } = options;
    return Boolean(enqueueLinks);
}
/**
 * Collects link URLs from a JSDOM window and enqueues them.
 *
 * Elements are matched by `options.options.selector` (default `'a'`). When the caller supplies its own
 * `enqueueLinks` function it is used; otherwise the links go through the shared `enqueueLinks`
 * helper together with `requestQueue`, `robotsTxtFile` and `onSkippedRequest`. User-supplied
 * enqueue options are spread last, so they override the computed `urls` / `baseUrl`.
 *
 * Throws when `options.window` is missing.
 * @internal
 */
async function domCrawlerEnqueueLinks(options) {
    const { window: jsdomWindow, originalRequestUrl, finalRequestUrl } = options;
    const userOptions = options.options;
    if (!jsdomWindow) {
        throw new Error('Cannot enqueue links because the JSDOM is not available.');
    }
    // Base URL used when filtering links according to the enqueue strategy.
    const filteringBaseUrl = (0, http_1.resolveBaseUrlForEnqueueLinksFiltering)({
        enqueueStrategy: userOptions?.strategy,
        finalRequestUrl,
        originalRequestUrl,
        userProvidedBaseUrl: userOptions?.baseUrl,
    });
    // Base URL used to turn the extracted hrefs into absolute URLs.
    const resolutionBaseUrl = userOptions?.baseUrl ?? finalRequestUrl ?? originalRequestUrl;
    const linkSelector = userOptions?.selector ?? 'a';
    const urls = extractUrlsFromWindow(jsdomWindow, linkSelector, resolutionBaseUrl);
    const linkArguments = { urls, baseUrl: filteringBaseUrl, ...userOptions };
    if (containsEnqueueLinks(options)) {
        return options.enqueueLinks(linkArguments);
    }
    return (0, http_1.enqueueLinks)({
        requestQueue: options.requestQueue,
        robotsTxtFile: options.robotsTxtFile,
        onSkippedRequest: options.onSkippedRequest,
        ...linkArguments,
    });
}
/**
 * Reads the `href` of every element matching `selector` in the window's document and
 * returns the absolute URLs resolved against `baseUrl`.
 * Elements without an `href` (undefined or empty) and hrefs that do not resolve are left out.
 * @ignore
 */
function extractUrlsFromWindow(window, selector, baseUrl) {
    const matchedElements = Array.from(window.document.querySelectorAll(selector));
    const absoluteUrls = [];
    for (const element of matchedElements) {
        const rawHref = element.href;
        if (rawHref === undefined || rawHref === '') {
            continue;
        }
        const absoluteUrl = (0, http_1.tryAbsoluteURL)(rawHref, baseUrl);
        if (absoluteUrl !== undefined && absoluteUrl !== '') {
            absoluteUrls.push(absoluteUrl);
        }
    }
    return absoluteUrls;
}
/**
 * Builds a new {@link Router} that dispatches requests by their label.
 * The returned router can be passed as the `requestHandler` of a {@link JSDOMCrawler};
 * its context defaults to the {@link JSDOMCrawlingContext}.
 *
 * > Shortcut for `Router.create<JSDOMCrawlingContext>()`.
 *
 * ```ts
 * import { JSDOMCrawler, createJSDOMRouter } from 'crawlee';
 *
 * const router = createJSDOMRouter();
 * router.addHandler('label-a', async (ctx) => {
 *    ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *    ctx.log.info('...');
 * });
 *
 * const crawler = new JSDOMCrawler({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 *
 * @param routes Optional initial routes, passed to `Router.create` as is.
 */
function createJSDOMRouter(routes) {
    const router = http_1.Router.create(routes);
    return router;
}
//# sourceMappingURL=jsdom-crawler.js.map