JSPM

  • Created
  • Published
  • Downloads 2
  • Score
    100M100P100Q58923F
  • License Apache-2.0

Unified browser / HTML controller interfaces that support playwright, puppeteer and cheerio

Package Exports

    Readme

    You can use a free LetsScrapeData App if you want to scrape web data without programming.

    For quick help and discussion about how to scrape a website, please use the Discord server. To report problems, it is better to submit issues on GitHub so that they can be tracked.

    Features

    This package is used by @letsscrapedata/scraper to make it easy to switch between different types of browser controllers, and to support new anti-bot browser controllers without modifying existing programs.

    • Same interface for patchright, camoufox, playwright, puppeteer and cheerio: easy to switch between them
    • Web browsing automation: goto(open) / click / input / hover / select / scroll
    • State data management: cookies, localStorage, HTTP Headers, custom session data
    • Request and response interception management: data and HTTP headers
    • Elements selection by CSS selectors or XPath: whether in frames or not
    • Element's attributes: innerHtml, innerText, outerHtml, textContent, etc
    • Automatic file saving: such as screenshot, pdf
    • CDP message
    • Page evaluate
    • Fills in functions that individual browser controllers do not support, and provides workarounds for known issues

    Install

    npm install @letsscrapedata/controller

    Examples

    import { controller } from "@letsscrapedata/controller";
    
    // Launch a visible (non-headless) Chromium browser through the patchright controller.
    const browser = await controller.launch("patchright", "chromium", { headless: false });
    // const browser = await controller.launch("camoufox", "firefox", { headless: false });
    try {
      // newBrowserContext() is declared to return LsdBrowserContext | null, so guard before use.
      const browserContext = await browser.newBrowserContext();
      if (!browserContext) {
        throw new Error("Failed to create a browser context");
      }

      // getPage() is declared to return LsdPage | null.
      const page = await browserContext.getPage();
      if (!page) {
        throw new Error("Failed to get a free page");
      }

      await page.goto("https://www.letsscrapedata.com/pages/listexample1.html");
      await page.screenshot({path: "screenshot.png"});
    } finally {
      // Always close the browser, even if one of the steps above throws.
      await browser.close();
    }

    Same interfaces

    • LsdElement
    • LsdPage
    • LsdBrowserContext
    • LsdBrowser
    • LsdBrowserController

    LsdPage

    /**
     * Page interface that is the same for every supported controller
     * (patchright, camoufox, playwright, puppeteer, cheerio).
     */
    export interface LsdPage extends EventEmitter {
      /**
       * Get the LsdApiContext associated with this page's LsdBrowserContext
       * * only valid in playwright
       */
      apiContext(): LsdApiContext;
    
      bringToFront(): Promise<boolean>;
    
      browserContext(): LsdBrowserContext;
    
      /**
       * Clear the cookies of the current page (url)
       * * Prerequisites: the page must have a valid url, such as by calling goto(url)
       */
      clearCookies(): Promise<boolean>;
    
      /**
       * Clear the localStorage of the current page (url)
       * * Prerequisites: the page must have a valid url, such as by calling goto(url)
       */
      clearLocalStorage(): Promise<boolean>;
    
      /**
       * Clear all request interceptions on the page
       */
      clearRequestInterceptions(): Promise<boolean>;
      /**
       * Clear all response interceptions on the page
       */
      clearResponseInterceptions(): Promise<boolean>;
    
      /**
       * Clear the stateData of the current page (url):
       * * stateData: cookies, localStorage, indexedDB
       * * Prerequisites: the page must have a valid url, such as by calling goto(url)
       */
      clearStateData(): Promise<boolean>;
    
      /**
       * Only a free page can be closed!
       */
      close(): Promise<boolean>;
    
      /**
       * Get the full HTML content of the page or of a descendant frame
       * @param iframeOptions default [], selectors of descendant frames
       */
      content(iframeOptions?: IframeOption[]): Promise<string>;
    
      cookies(): Promise<CookieItem[]>;
    
      /**
       * Page evaluate
       * NOTE(review): the method name is spelled `evalute` in this interface; presumably `fun` is
       * run in the page with `args` and its result is returned — confirm
       */
      evalute(fun: Function, args?: any[]): Promise<any>;
    
      /**
       * @returns the first element matching the given CSS selector or XPath
       * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
       * @param iframeOptions default [], options to select descendant frame
       */
      findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement | null>;
    
      /**
       * @returns elements matching the given CSS selector or XPath
       * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
       * @param iframeOptions default [], options to select descendant frame
       */
    
      findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement[]>;
    
      /**
       * Free a busy page. All request and response interceptions will be cleared.
       */
      free(): Promise<boolean>;
    
      /**
       * Open (navigate to) the given url in this page
       * @param url the url to open
       * @param options NOTE(review): GotoOptions is not shown here — confirm the supported fields
       * @returns NOTE(review): presumably whether the navigation succeeded — confirm
       */
      goto(url: string, options?: GotoOptions): Promise<boolean>;
    
      id(): string;
    
      isFree(): boolean;
    
      /**
       * valid only in CheerioPage
       * @param html 
       * @param isHtml default true
       */
      load(html: string, isHtml?: boolean): boolean;
    
      // NOTE(review): the method name is spelled `localStroage` in this interface
      localStroage(): Promise<LocalStorageOrigin[]>;
    
      mainFrame(): AllFrame;
    
      maximizeViewport(): Promise<boolean>;
    
      pageHeight(): Promise<number>;
    
      pageInfo(): PageInfo;
    
      pageWidth(): Promise<number>;
    
      pdf(options?: PDFOptions): Promise<Buffer>;
    
      screenshot(options?: ScreenshotOptions): Promise<Buffer>;
    
      scrollBy(x: number, y: number): Promise<boolean>;
    
      scrollTo(x: number, y: number): Promise<boolean>;
    
      /**
       * Send a CDP message over the current (not detached) or a new CDP session
       * @param method protocol method name
       * @param params default null (ignored), method parameters
       * @param detach default true, whether to detach the CDPSession from target
       */
      sendCDPMessage(method: string, params?: object | null, detach?: boolean): Promise<any>;
    
      setCookies(cookies: CookieItem[]): Promise<boolean>;
    
      setExtraHTTPHeaders(headers: Record<string, string>): Promise<boolean>;
    
      /**
       * Set localStorage on the current web page (page.url())
       * NOTE(review): the method name is spelled `setLocalStroage` in this interface
       * @param localStorageItems the items to set
       */
      setLocalStroage(localStorageItems: LocalStorageItem[]): Promise<boolean>;
    
      setPageInfo(pageInfo: UpdatablePageInfo): boolean;
    
      /**
       * Intercept requests that meet the conditions (requestMatch) to perform an action (action and fulfill).
       * @param options a single interception option or an array of them
       */
      setRequestInterception(options: RequestInterceptionOption | RequestInterceptionOption[]): Promise<boolean>;
      /**
       * Intercept responses that meet the conditions (requestMatch and responseMatch) to perform actions (cacheArray and handler)
       * @param options a single interception option or an array of them
       */
      setResponseInterception(options: ResponseInterceptionOption | ResponseInterceptionOption[]): Promise<boolean>;
    
      /**
       * Shortcut for LsdPage.browserContext().setStateData(stateData)
       * @param stateData 
       */
      setStateData(stateData: BrowserStateData): Promise<boolean>;
    
      /**
       * valid only in puppeteer
       * @param userAgent
       */
      setUserAgent(userAgent: string): Promise<boolean>;
    
      setViewportSize(viewPortSize: ViewportSize): Promise<boolean>;
    
      stateData(): Promise<BrowserStateData>;
    
      status(): PageStatus;
    
      title(): Promise<string>;
    
      url(): string;
    
      /**
       * Start to use this free page
       */
      use(): boolean;
    
      /**
       * NOTE(review): presumably waits for an element matching the selector — confirm
       * @param selector CSS selector, not XPath
       * @param options 
       */
      waitForElement(selector: string, options?: WaitElementOptions): Promise<boolean>;
    
      /**
       * NOTE(review): `options` is required here, unlike in waitForElement — confirm this is intended
       * @param options 
       */
      waitForNavigation(options: WaitNavigationOptions): Promise<boolean>;
    
      /**
       * obj=window?.[key1]...?.[keyn]
       * @returns obj ? JSON.stringify(obj) : ""
       * @param keys the chain of property keys to follow from window
       */
      windowMember(keys: string[]): Promise<string>;
    
      _origPage(): AllPage;
    }

    LsdElement

    /**
     * Element interface that is the same for every supported controller
     * (patchright, camoufox, playwright, puppeteer, cheerio).
     */
    export interface LsdElement {
      ///////////////////////////////////////////////////////////////////////////////    methods used to extract data from the element
      /**
       * @returns the value of a specified attribute on the element
       * @param attributeName name of the attribute to read
       */
      attribute(attributeName: string): Promise<string>;
      /**
       * @returns the attribute names of the element
       */
      attributeNames(): Promise<string[]>;
      /**
       * @returns the first element matching the given CSS selector or XPath
       * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
       * @param iframeOptions default [], options to select descendant frame
       * @param absolute valid only if iframeOptions.length===0
       */
      findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement | null>;
      /**
       * @returns elements matching the given CSS selector or XPath
       * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
       * @param iframeOptions default [], options to select descendant frame
       * @param absolute valid only if iframeOptions.length===0
       */
      findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement[]>;
      /**
       * @returns whether the element has the specified attribute or not
       * @param attributeName name of the attribute to look for
       */
      hasAttribute(attributeName: string): Promise<boolean>;
      /**
       * @returns the HTML or XML markup contained within the element
       */
      innerHtml(): Promise<string>;
    
      /**
       * @returns innerText of element
       * @param onlyChild default false, whether to include only the text of the child text nodes
       */
      innerText(onlyChild?: boolean): Promise<string>;
      /**
       * @returns the serialized HTML fragment describing the element including its descendants
       */
      outerHtml(): Promise<string>;
      textContent(): Promise<string>;
    
      ///////////////////////////////////////////////////////////////////////////////    methods to operate the element(only valid for browser)
      /**
       * Click this element.
       * @param options default {button: "left", count: 1, delay: 0, modifies: []}
       */
      click(options?: MouseClickOptions): Promise<boolean>;
      focus(): Promise<boolean>;
      hover(): Promise<boolean>;
      /**
       * Input a value into this element; the underlying call differs per controller:
       * * playwright: fill
       * * puppeteer: type
       */
      input(value: string, options?: InputOptions): Promise<boolean>;
      // NOTE(review): `options` is required here, unlike in the other methods of this interface — confirm
      press(key: KeyInput, options: KeyPressOptions): Promise<boolean>;
      screenshot(options?: ScreenshotOptions): Promise<Buffer>;
      scrollIntoView(): Promise<boolean>;
      select(options: SelectOptions): Promise<boolean>;
      setAttribute(attributeName: string, newValue: string): Promise<boolean>;
      _origElement(): AllElement;
    }

    LsdBrowserContext

    /**
     * Browser-context interface that is the same for every supported controller
     * (patchright, camoufox, playwright, puppeteer, cheerio).
     */
    export interface LsdBrowserContext extends EventEmitter {
      /**
       * Get the LsdApiContext associated with this LsdBrowserContext
       * * only valid in playwright
       */
      apiContext(): LsdApiContext;
    
      browser(): LsdBrowser;
    
      close(): Promise<boolean>;
    
      /**
       * Close pages that have been free for more than maxPageFreeSeconds if maxPageFreeSeconds > 0
       * * but the last page in the browserContext will not be closed
       * @param maxPageFreeSeconds default 0, meaning the default maxPageFreeSeconds of the browserContext will be used
       */
      closeFreePages(maxPageFreeSeconds?: number): Promise<boolean>;
    
      /**
       * Does this browserContext meet browserContextRequirements (incognitos ignored in browser)?
       * NOTE(review): the parenthetical is carried over unchanged from the original comment — confirm it applies here
       * @param browserContextRequirements
       */
      doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;
    
      /**
       * Get a free page from the current pages or by creating a new page
       * @param always NOTE(review): not documented in the original — confirm its meaning
       * @returns the page, or null (per the declared type) when no page is obtained
       */
      getPage(always?: boolean): Promise<LsdPage | null>;
    
      /**
       * Whether a number of free page(s) can be obtained
       * * refer to getPage()
       * @param pageNum default 1, the number of free pages
       */
      hasFreePage(pageNum?: number): boolean;
    
      id(): string;
    
      isIncognito(): boolean;
    
      page(pageIdx: number): LsdPage | null;
    
      pages(): LsdPage[];
    
      proxy(): ProxyInController | null; // reserved (spare), for future use
    
      setStateData(stateData: BrowserStateData): Promise<boolean>;
    
      _origBrowserContext(): AllBrowserContext;
    }
    

    LsdBrowser

    /**
     * Browser interface that is the same for every supported controller
     * (patchright, camoufox, playwright, puppeteer, cheerio).
     */
    export interface LsdBrowser extends EventEmitter {
      // By default, constructor can be called in LsdBrowserController.launch/connect to create new instance
      // main methods
      newBrowserContext(options?: LsdBrowserContextOptions): Promise<LsdBrowserContext | null>;
      /**
       * 1. launched: close all browserContexts and this browser
       * 2. connected: 
       * * in puppeteer:  close all browserContexts and this browser???
       * * in playwright: only browserContexts created by newContext will be closed, browser is disconnected and browser will not be closed
       */
      close(): Promise<boolean>;
    
      // other methods
      browserContexts(): LsdBrowserContext[];
      browserControllerType(): BrowserControllerType;
      browserCreationMethod(): BrowserCreationMethod;
      browserType(): LsdBrowserType;
    
      /**
       * Does this browser meet browserContextRequirements (incognitos ignored in browser)?
       * @param browserContextRequirements
       */
      doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;
    
      /**
       * @returns
       * 1. launched: actual executable path
       * 2. connected: executablePath in LsdConnectOptions, default "" (unknown)
       */
      executablePath(): string;
    
      id(): string;
      isConnected(): boolean;
      isHeadless(): boolean;
      options(): LsdLaunchOptions | LsdConnectOptions;
      /**
       * * puppeteer: return pid of connected or launched browser
       * * playwright: return pid of connected browser that is launched manually or using launchServer, or else return 0
       */
      pid(): number;
      /**
       * Get the CPU utilization (%) and memory usage (MB) of browser processes if pid is greater than 0 (refer to pid())
       */
      pidUsage(): Promise<{ cpu: number, memory: number }>;
      version(): Promise<string>; // playwright: sync; puppeteer: async
    
      _origBrowser(): AllBrowser;
    }

    LsdBrowserController

    /**
     * Entry point used to launch or connect to a browser through one of the supported controllers.
     */
    export interface LsdBrowserController {
      /**
       * Launch a new browser using the related browser controller
       * @param browserControllerType 
       * @param browserType 
       * @param options 
       */
      launch(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdLaunchOptions): Promise<LsdBrowser>;
    
      /**
       * Connect to the current browser using the related browser controller
       * @param browserControllerType 
       * @param browserType 
       * @param options 
       */
      connect(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdConnectOptions): Promise<LsdBrowser>;
    
      /**
       * 
       * @param puppeteer  null means use puppeteer-extra-plugin-stealth based on puppeteer-extra
       */
      setPuppeteerNode(puppeteer: PuppeteerNode | null): boolean;
    
      /**
       * 
       * @param browserType the browser type that playwrightBrowserType is set for
       * @param playwrightBrowserType  null means use puppeteer-extra-plugin-stealth based on playwright-extra
       */
      setPlaywrightBrowserType(browserType: LsdBrowserType, playwrightBrowserType: BrowserType | null): boolean;
    
      /**
       * Create a new LsdApiContext, valid in playwright;
       */
      newApiContext(options?: LsdApiContextOptions): Promise<LsdApiContext>;
    }