"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.AdaptivePlaywrightCrawler = void 0;
exports.createAdaptivePlaywrightRouter = createAdaptivePlaywrightRouter;
const tslib_1 = require("tslib");
const browser_1 = require("@crawlee/browser");
const core_1 = require("@crawlee/core");
const utils_1 = require("@crawlee/utils");
const cheerio_1 = require("cheerio");
const lodash_isequal_1 = tslib_1.__importDefault(require("lodash.isequal"));
const timeout_1 = require("@apify/timeout");
const playwright_crawler_1 = require("./playwright-crawler");
const rendering_type_prediction_1 = require("./utils/rendering-type-prediction");
/**
 * Crawler statistics extended with three counters specific to adaptive crawling:
 * the number of HTTP-only request handler runs, the number of browser request handler runs
 * and the number of rendering type mispredictions.
 */
class AdaptivePlaywrightCrawlerStatistics extends core_1.Statistics {
    constructor(options = {}) {
        super(options);
        // An own `state` property has to be assigned for a valid override; the parent constructor
        // already initializes it through a reset() call, so it is re-initialized right below.
        Object.defineProperty(this, "state", {
            value: null,
            writable: true,
            configurable: true,
            enumerable: true,
        });
        this.reset();
    }
    /**
     * Resets the parent statistics and zeroes the three adaptive counters.
     */
    reset() {
        super.reset();
        Object.assign(this.state, {
            httpOnlyRequestHandlerRuns: 0,
            browserRequestHandlerRuns: 0,
            renderingTypeMispredictions: 0,
        });
    }
    /**
     * Lets the parent restore its part of the state, then copies the adaptive counters
     * from the value stored under `persistStateKey` (when there is one).
     */
    async _maybeLoadStatistics() {
        await super._maybeLoadStatistics();
        const persisted = await this.keyValueStore?.getValue(this.persistStateKey);
        if (persisted) {
            const counters = [
                'httpOnlyRequestHandlerRuns',
                'browserRequestHandlerRuns',
                'renderingTypeMispredictions',
            ];
            for (const counter of counters) {
                this.state[counter] = persisted[counter];
            }
        }
    }
    /** Counts one more run of the request handler over plain HTTP. */
    trackHttpOnlyRequestHandlerRun() {
        const { state } = this;
        // The counter may be missing (e.g. after loading a state without it), hence the fallback to 0.
        state.httpOnlyRequestHandlerRuns = (state.httpOnlyRequestHandlerRuns ?? 0) + 1;
    }
    /** Counts one more run of the request handler in a browser. */
    trackBrowserRequestHandlerRun() {
        const { state } = this;
        state.browserRequestHandlerRuns = (state.browserRequestHandlerRuns ?? 0) + 1;
    }
    /** Counts one more case where the predicted rendering type turned out to be wrong. */
    trackRenderingTypeMisprediction() {
        const { state } = this;
        state.renderingTypeMispredictions = (state.renderingTypeMispredictions ?? 0) + 1;
    }
}
/**
 * Names of the logger methods that `createLogProxy` intercepts and records instead of
 * executing, so that messages from an HTTP-only handler run can be replayed later (or dropped).
 *
 * `warning` is part of the list so that plain warnings are buffered like every other level;
 * otherwise they would be printed immediately even when the HTTP-only run is discarded.
 */
const proxyLogMethods = [
    'error',
    'exception',
    'softFail',
    'warning',
    'info',
    'debug',
    'perf',
    'warningOnce',
    'deprecated',
];
/**
 * An extension of {@link PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible.
 *
 * **Example usage:**
 *
 * ```javascript
 * const crawler = new AdaptivePlaywrightCrawler({
 *     renderingTypeDetectionRatio: 0.1,
 *     async requestHandler({ querySelector, pushData, enqueueLinks, request, log }) {
 *         // This function is called to extract data from a single web page
 *         const $prices = await querySelector('span.price')
 *
 *         await pushData({
 *             url: request.url,
 *             price: $prices.filter(':contains("$")').first().text(),
 *         })
 *
 *         await enqueueLinks({ selector: '.pagination a' })
 *     },
 * });
 *
 * await crawler.run([
 *     'http://www.example.com/page-1',
 *     'http://www.example.com/page-2',
 * ]);
 * ```
 *
 * @experimental
 */
class AdaptivePlaywrightCrawler extends playwright_crawler_1.PlaywrightCrawler {
    /**
     * @param options Crawler options. `requestHandler`, `renderingTypeDetectionRatio`, `renderingTypePredictor`,
     *   `resultChecker`, `resultComparator`, `statisticsOptions` and `preventDirectStorageAccess` are consumed here;
     *   all remaining options are forwarded to the parent constructor.
     * @param config Configuration to use; defaults to the global configuration.
     */
    constructor(options = {}, config = core_1.Configuration.getGlobalConfig()) {
        const { requestHandler, renderingTypeDetectionRatio = 0.1, renderingTypePredictor, resultChecker, resultComparator, statisticsOptions, preventDirectStorageAccess = true, ...rest } = options;
        super(rest, config);
        Object.defineProperty(this, "config", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: config
        });
        Object.defineProperty(this, "adaptiveRequestHandler", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "renderingTypePredictor", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "resultChecker", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "resultComparator", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "preventDirectStorageAccess", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * Default {@link Router} instance that will be used if we don't specify any {@link AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
         * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
         */
        // @ts-ignore
        Object.defineProperty(this, "router", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: core_1.Router.create()
        });
        this.adaptiveRequestHandler = requestHandler ?? this.router;
        this.renderingTypePredictor =
            renderingTypePredictor ?? new rendering_type_prediction_1.RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio });
        // Without a user-supplied checker, every result is considered acceptable.
        this.resultChecker = resultChecker ?? (() => true);
        // Comparator precedence: explicit comparator > one derived from the explicit checker > deep equality of dataset items.
        if (resultComparator !== undefined) {
            this.resultComparator = resultComparator;
        }
        else if (resultChecker !== undefined) {
            this.resultComparator = (resultA, resultB) => this.resultChecker(resultA) && this.resultChecker(resultB);
        }
        else {
            this.resultComparator = (resultA, resultB) => {
                return (resultA.datasetItems.length === resultB.datasetItems.length &&
                    resultA.datasetItems.every((itemA, i) => {
                        const itemB = resultB.datasetItems[i];
                        return (0, lodash_isequal_1.default)(itemA, itemB);
                    }));
            };
        }
        this.stats = new AdaptivePlaywrightCrawlerStatistics({
            logMessage: `${this.log.getOptions().prefix} request statistics:`,
            config,
            ...statisticsOptions,
        });
        this.preventDirectStorageAccess = preventDirectStorageAccess;
    }
    /**
     * Initializes the rendering type predictor before running the parent initialization.
     */
    async _init() {
        await this.renderingTypePredictor.initialize();
        return await super._init();
    }
    /**
     * Processes a single request. Depending on the rendering type prediction, the request handler is
     * run over plain HTTP, in a browser, or in both (when a rendering type detection is due).
     *
     * - Predicted `static` and no detection: the HTTP-only run is tried first and its result is committed
     *   when it succeeds and passes `resultChecker`; otherwise the browser run follows.
     * - Otherwise: the browser run is executed and committed; when a detection is due, the HTTP-only
     *   run is executed afterwards only to compare results, and the outcome is stored in the predictor.
     *
     * @param crawlingContext Context of the request being processed.
     * @throws The error of the browser run when that run fails.
     */
    async _runRequestHandler(crawlingContext) {
        const renderingTypePrediction = this.renderingTypePredictor.predict(crawlingContext.request);
        const shouldDetectRenderingType = Math.random() < renderingTypePrediction.detectionProbabilityRecommendation;
        if (!shouldDetectRenderingType) {
            crawlingContext.log.debug(`Predicted rendering type ${renderingTypePrediction.renderingType} for ${crawlingContext.request.url}`);
        }
        if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) {
            crawlingContext.log.debug(`Running HTTP-only request handler for ${crawlingContext.request.url}`);
            this.stats.trackHttpOnlyRequestHandlerRun();
            const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext);
            if (plainHTTPRun.ok && this.resultChecker(plainHTTPRun.result)) {
                crawlingContext.log.debug(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`);
                // Replay the log calls that were buffered during the HTTP-only run.
                plainHTTPRun.logs?.forEach(([log, method, ...args]) => log[method](...args));
                await this.commitResult(crawlingContext, plainHTTPRun.result);
                return;
            }
            if (!plainHTTPRun.ok) {
                crawlingContext.log.exception(plainHTTPRun.error, `HTTP-only request handler failed for ${crawlingContext.request.url}`);
            }
            else {
                crawlingContext.log.warning(`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`);
                this.stats.trackRenderingTypeMisprediction();
            }
        }
        crawlingContext.log.debug(`Running browser request handler for ${crawlingContext.request.url}`);
        this.stats.trackBrowserRequestHandlerRun();
        // Run the request handler in a browser. The copy of the crawler state is kept so that we can perform
        // a rendering type detection if necessary. Without this measure, the HTTP request handler would run
        // under different conditions, which could change its behavior. Changes done to the crawler state by
        // the HTTP request handler will not be committed to the actual storage.
        const { result: browserRun, initialStateCopy } = await this.runRequestHandlerInBrowser(crawlingContext);
        if (!browserRun.ok) {
            throw browserRun.error;
        }
        await this.commitResult(crawlingContext, browserRun.result);
        if (shouldDetectRenderingType) {
            crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`);
            const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext, initialStateCopy);
            // Maps the comparison outcome to a rendering type; `undefined` means "inconclusive" and nothing is stored.
            const detectionResult = (() => {
                if (!plainHTTPRun.ok) {
                    return 'clientOnly';
                }
                const comparisonResult = this.resultComparator(plainHTTPRun.result, browserRun.result);
                if (comparisonResult === true || comparisonResult === 'equal') {
                    return 'static';
                }
                if (comparisonResult === false || comparisonResult === 'different') {
                    return 'clientOnly';
                }
                return undefined;
            })();
            crawlingContext.log.debug(`Detected rendering type ${detectionResult} for ${crawlingContext.request.url}`);
            if (detectionResult !== undefined) {
                this.renderingTypePredictor.storeResult(crawlingContext.request, detectionResult);
            }
        }
    }
    /**
     * Applies the effects recorded during a request handler run: the recorded `pushData` and `addRequests`
     * calls are forwarded to the crawling context and the recorded key-value store changes are written
     * to the respective stores. All of these operations are started concurrently.
     *
     * @param crawlingContext Context that provides `pushData`, `addRequests` and `getKeyValueStore`.
     * @param result Recorded handler result; only `calls` and `keyValueStoreChanges` are read.
     */
    async commitResult(crawlingContext, { calls, keyValueStoreChanges }) {
        await Promise.all([
            ...calls.pushData.map(async (params) => crawlingContext.pushData(...params)),
            ...calls.addRequests.map(async (params) => crawlingContext.addRequests(...params)),
            ...Object.entries(keyValueStoreChanges).map(async ([storeIdOrName, changes]) => {
                const store = await crawlingContext.getKeyValueStore(storeIdOrName);
                await Promise.all(Object.entries(changes).map(async ([key, { changedValue, options }]) => store.setValue(key, changedValue, options)));
            }),
        ]);
    }
    /**
     * Wraps `func` so that it runs under `withCheckedStorageAccess` with a no-op checker,
     * i.e. storage access performed inside `func` is not rejected.
     *
     * @param func Function to wrap.
     * @returns Async function that forwards its arguments to `func`.
     */
    allowStorageAccess(func) {
        return async (...args) => (0, core_1.withCheckedStorageAccess)(() => { }, async () => func(...args));
    }
    /**
     * Runs the adaptive request handler in a browser by delegating to the parent `_runRequestHandler`
     * with `userProvidedRequestHandler` swapped (through a proxy of `this`) for a function that builds
     * the restricted context and records all effects into a `RequestHandlerResult`.
     *
     * @param crawlingContext Context of the request being processed.
     * @returns `{ result, initialStateCopy }` where `result` is `{ result, ok: true }` or `{ error, ok: false }`
     *   and `initialStateCopy` is a JSON copy of the state returned by the first `useState` call (if any).
     */
    async runRequestHandlerInBrowser(crawlingContext) {
        const result = new core_1.RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
        let initialStateCopy;
        try {
            await super._runRequestHandler.call(new Proxy(this, {
                get: (target, propertyName, receiver) => {
                    if (propertyName === 'userProvidedRequestHandler') {
                        return async (playwrightContext) => (0, core_1.withCheckedStorageAccess)(() => {
                            if (this.preventDirectStorageAccess) {
                                throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
                            }
                        }, () => this.adaptiveRequestHandler({
                            id: crawlingContext.id,
                            session: crawlingContext.session,
                            proxyInfo: crawlingContext.proxyInfo,
                            request: crawlingContext.request,
                            response: {
                                url: crawlingContext.response.url(),
                                statusCode: crawlingContext.response.status(),
                                headers: crawlingContext.response.headers(),
                                trailers: {},
                                complete: true,
                                redirectUrls: [],
                            },
                            log: crawlingContext.log,
                            page: crawlingContext.page,
                            // The selector helpers wait for the first match to be attached before reading the page.
                            querySelector: async (selector, timeoutMs = 5000) => {
                                const locator = playwrightContext.page.locator(selector).first();
                                await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
                                const $ = await playwrightContext.parseWithCheerio();
                                return $(selector);
                            },
                            async waitForSelector(selector, timeoutMs = 5000) {
                                const locator = playwrightContext.page.locator(selector).first();
                                await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
                            },
                            async parseWithCheerio(selector, timeoutMs = 5000) {
                                if (selector) {
                                    const locator = playwrightContext.page.locator(selector).first();
                                    await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
                                }
                                return playwrightContext.parseWithCheerio();
                            },
                            enqueueLinks: async (options = {}, timeoutMs = 5000) => {
                                let urls;
                                if (options.urls === undefined) {
                                    const selector = options.selector ?? 'a';
                                    const locator = playwrightContext.page.locator(selector).first();
                                    await locator.waitFor({ timeout: timeoutMs, state: 'attached' });
                                    urls = await (0, browser_1.extractUrlsFromPage)(playwrightContext.page, selector, options.baseUrl ??
                                        playwrightContext.request.loadedUrl ??
                                        playwrightContext.request.url);
                                }
                                else {
                                    urls = options.urls;
                                }
                                return await this.enqueueLinks({ ...options, urls }, crawlingContext.request, result);
                            },
                            addRequests: result.addRequests,
                            pushData: result.pushData,
                            useState: this.allowStorageAccess(async (defaultValue) => {
                                const state = await result.useState(defaultValue);
                                // Remember the state as it was on first access, before the handler can change it.
                                if (initialStateCopy === undefined) {
                                    initialStateCopy = JSON.parse(JSON.stringify(state));
                                }
                                return state;
                            }),
                            getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
                        }));
                    }
                    return Reflect.get(target, propertyName, receiver);
                },
            }), crawlingContext);
            return { result: { result, ok: true }, initialStateCopy };
        }
        catch (error) {
            return { result: { error, ok: false }, initialStateCopy };
        }
    }
    /**
     * Runs the adaptive request handler against a response fetched with `crawlingContext.sendRequest`
     * and parsed with cheerio. Effects are recorded into a `RequestHandlerResult` and log calls made
     * through the context are buffered into `logs` instead of being printed.
     *
     * @param crawlingContext Context of the request being processed.
     * @param oldStateCopy When defined, `useState` returns this value (falling back to the default value
     *   for `null`) instead of reading the state - used when rerunning the handler for detection.
     * @returns `{ result, logs, ok: true }` on success, `{ error, logs, ok: false }` when anything throws
     *   or the run exceeds `requestHandlerTimeoutInnerMillis`.
     */
    async runRequestHandlerWithPlainHTTP(crawlingContext, oldStateCopy) {
        const result = new core_1.RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
        const logs = [];
        const pageGotoOptions = { timeout: this.navigationTimeoutMillis }; // Irrelevant, but required by BrowserCrawler
        try {
            await (0, core_1.withCheckedStorageAccess)(() => {
                if (this.preventDirectStorageAccess) {
                    throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler');
                }
            }, async () => (0, timeout_1.addTimeoutToPromise)(async () => {
                const hookContext = {
                    id: crawlingContext.id,
                    session: crawlingContext.session,
                    proxyInfo: crawlingContext.proxyInfo,
                    request: crawlingContext.request,
                    log: this.createLogProxy(crawlingContext.log, logs),
                };
                await this._executeHooks(this.preNavigationHooks, {
                    ...hookContext,
                    get page() {
                        throw new Error('Page object was used in HTTP-only pre-navigation hook');
                    },
                }, // This is safe because `executeHooks` just passes the context to the hooks which accept the partial context
                pageGotoOptions);
                const response = await crawlingContext.sendRequest({});
                const loadedUrl = response.url;
                crawlingContext.request.loadedUrl = loadedUrl;
                if (!this.requestMatchesEnqueueStrategy(crawlingContext.request)) {
                    const request = crawlingContext.request;
                    this.log.debug(
                    // eslint-disable-next-line dot-notation
                    `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`);
                    request.noRetry = true;
                    request.state = core_1.RequestState.SKIPPED;
                    await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
                    return;
                }
                const $ = (0, cheerio_1.load)(response.body);
                await this.adaptiveRequestHandler({
                    ...hookContext,
                    request: crawlingContext.request,
                    response,
                    get page() {
                        throw new Error('Page object was used in HTTP-only request handler');
                    },
                    // There is nothing to wait for in a static document, so the timeouts are ignored.
                    async querySelector(selector, _timeoutMs) {
                        return $(selector);
                    },
                    async waitForSelector(selector, _timeoutMs) {
                        if ($(selector).get().length === 0) {
                            throw new Error(`Selector '${selector}' not found.`);
                        }
                    },
                    async parseWithCheerio(selector, _timeoutMs) {
                        if (selector && $(selector).get().length === 0) {
                            throw new Error(`Selector '${selector}' not found.`);
                        }
                        return $;
                    },
                    enqueueLinks: async (options = {}) => {
                        const urls = options.urls ??
                            (0, utils_1.extractUrlsFromCheerio)($, options.selector, options.baseUrl ?? loadedUrl);
                        return this.enqueueLinks({ ...options, urls }, crawlingContext.request, result);
                    },
                    addRequests: result.addRequests,
                    pushData: result.pushData,
                    useState: async (defaultValue) => {
                        // return the old state before the browser handler was executed
                        // when rerunning the handler via HTTP for detection
                        if (oldStateCopy !== undefined) {
                            return oldStateCopy ?? defaultValue; // fallback to the default for `null`
                        }
                        return this.allowStorageAccess(result.useState)(defaultValue);
                    },
                    getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore),
                });
                // NOTE(review): the post-navigation hooks run after the request handler here and receive the
                // original `crawlingContext` (not `hookContext` with the log proxy) - confirm this is intended.
                await this._executeHooks(this.postNavigationHooks, crawlingContext, pageGotoOptions);
            }, this.requestHandlerTimeoutInnerMillis, 'Request handler timed out'));
            return { result, logs, ok: true };
        }
        catch (error) {
            return { error, logs, ok: false };
        }
    }
    /**
     * Enqueues links through the core `enqueueLinks` helper, but records the requests into `result`
     * (via a minimal request queue stand-in) instead of adding them to a real request queue.
     *
     * @param options Enqueue options; they override the default `limit` and `onSkippedRequest`.
     * @param request Request whose `url` / `loadedUrl` are used to resolve the base URL for filtering.
     * @param result Handler result that collects the requests to add.
     * @returns Whatever the core `enqueueLinks` helper returns.
     */
    async enqueueLinks(options, request, result) {
        const baseUrl = (0, core_1.resolveBaseUrlForEnqueueLinksFiltering)({
            enqueueStrategy: options?.strategy,
            finalRequestUrl: request.loadedUrl,
            originalRequestUrl: request.url,
            userProvidedBaseUrl: options?.baseUrl,
        });
        return await (0, core_1.enqueueLinks)({
            limit: this.calculateEnqueuedRequestLimit(options.limit),
            // Wrapped in an arrow function so that the method keeps its `this` when invoked as a plain callback.
            onSkippedRequest: (...args) => this.handleSkippedRequest(...args),
            ...options,
            baseUrl,
            requestQueue: {
                addRequestsBatched: async (requests) => {
                    await result.addRequests(requests);
                    // Nothing is really enqueued at this point, so every request is reported as newly added.
                    return {
                        addedRequests: requests.map(({ uniqueKey, id }) => ({
                            uniqueKey,
                            requestId: id ?? '',
                            wasAlreadyPresent: false,
                            wasAlreadyHandled: false,
                        })),
                        waitForAllRequestsToBeAdded: Promise.resolve([]),
                    };
                },
            },
        });
    }
    /**
     * Creates a proxy of `log` whose methods listed in `proxyLogMethods` do not log anything;
     * instead, each call is appended to `logs` as `[log, methodName, ...args]` for a later replay.
     * Every other property is read from the original logger.
     *
     * @param log Logger to wrap.
     * @param logs Array that receives the recorded calls.
     * @returns The proxied logger.
     */
    createLogProxy(log, logs) {
        return new Proxy(log, {
            get(target, propertyName, receiver) {
                if (proxyLogMethods.includes(propertyName)) {
                    return (...args) => {
                        logs.push([target, propertyName, ...args]);
                    };
                }
                return Reflect.get(target, propertyName, receiver);
            },
        });
    }
}
exports.AdaptivePlaywrightCrawler = AdaptivePlaywrightCrawler;
/**
 * Creates a router for use as the request handler of {@link AdaptivePlaywrightCrawler}.
 *
 * @param routes Passed through unchanged to `Router.create()`.
 * @returns The router instance produced by `Router.create()`.
 */
function createAdaptivePlaywrightRouter(routes) {
    const router = core_1.Router.create(routes);
    return router;
}
//# sourceMappingURL=adaptive-playwright-crawler.js.map