JSPM

  • ESM via JSPM
  • ES Module Entrypoint
  • Export Map
  • Keywords
  • License
  • Repository URL
  • TypeScript Types
  • README
  • Created
  • Published
  • Downloads 158
  • Score
    100M100P100Q87753F
  • License MIT

Parse And Write Web Archive Records (WARC) Files

Package Exports

  • node-warc

This package does not declare an exports field, so the exports above have been automatically detected and optimized by JSPM instead. If any package subpath is missing, it is recommended to post an issue to the original package (node-warc) to support the "exports" field. If that is not possible, create a JSPM override to customize the exports field for this package.

Readme

node-warc

Parse Web Archive (WARC) files or create WARC files using chrome-remote-interface, chrome-remote-interface-extra, or Puppeteer.

Run npm install node-warc or yarn add node-warc to get started.

npm Package

Documentation

Full documentation available at n0tan3rd.github.io/node-warc

Parsing

Using async iteration

Requires node 10 or greater

const fs = require('fs')
const zlib = require('zlib')
// recordIterator only exported if async iteration on readable streams is available
const { recordIterator } = require('node-warc')

/**
 * Logs every record that recordIterator yields for the supplied stream.
 *
 * @param {import('stream').Readable} warcStream - stream handed to
 *   recordIterator; in this example, the gunzipped contents of a WARC file
 * @returns {Promise<void>} settles once iteration has finished
 */
async function iterateRecords (warcStream) {
  for await (const record of recordIterator(warcStream)) {
    console.log(record)
  }
}

iterateRecords(
  fs.createReadStream('<path-to-gzipd-warcfile>').pipe(zlib.createGunzip())
)
  .then(() => {
    console.log('done')
  })
  // a failure while iterating would otherwise surface as an unhandled rejection
  .catch(error => {
    console.error(error)
  })

Or using one of the parsers

const { AutoWARCParser } = require('node-warc')

// for await is only valid inside an async function in a CommonJS script
;(async () => {
  for await (const record of new AutoWARCParser('<path-to-warcfile>')) {
    console.log(record)
  }
})().catch(error => {
  console.error(error)
})

Using Stream Transform

const fs = require('fs')
const { WARCStreamTransform } = require('node-warc')

// Pipe the file through the transform; each 'data' event delivers one record.
const warcFileStream = fs.createReadStream('<path-to-warcfile>')
const recordStream = warcFileStream.pipe(new WARCStreamTransform())

recordStream.on('data', record => {
  console.log(record)
})

Both .warc and .warc.gz

const { AutoWARCParser } = require('node-warc')

const parser = new AutoWARCParser('<path-to-warcfile>')

// Register all listeners before starting the parser.
parser.on('record', record => {
  console.log(record)
})
parser.on('done', () => {
  console.log('finished')
})
parser.on('error', error => {
  console.error(error)
})

parser.start()

Only gzip'd warc files

const { WARCGzParser } = require('node-warc')

// WARCGzParser is the gzip-only parser, so point it at the gzip'd file
const parser = new WARCGzParser('<path-to-gzipd-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()

Only non gzip'd warc files

// WARCParser (not WARCGzParser) is the parser for non gzip'd files
const { WARCParser } = require('node-warc')

const parser = new WARCParser('<path-to-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()

WARC Creation

Environment

  • NODEWARC_WRITE_GZIPPED - enable writing gzipped records to WARC outputs.

Examples

Using chrome-remote-interface

const CRI = require('chrome-remote-interface')
const { RemoteChromeWARCGenerator, RemoteChromeCapturer } = require('node-warc')

;(async () => {
  let client
  try {
    client = await CRI()
    await Promise.all([
      client.Page.enable(),
      client.Network.enable()
    ])
    const cap = new RemoteChromeCapturer(client.Network)
    cap.startCapturing()
    await client.Page.navigate({ url: 'http://example.com' })
    // actual code should wait for a better stopping condition, eg. network idle
    await client.Page.loadEventFired()
    const warcGen = new RemoteChromeWARCGenerator()
    await warcGen.generateWARC(cap, client.Network, {
      warcOpts: {
        warcPath: 'myWARC.warc'
      },
      winfo: {
        description: 'I created a warc!',
        isPartOf: 'My awesome pywb collection'
      }
    })
  } catch (err) {
    console.error(err)
  } finally {
    // close the connection even when capturing or WARC generation fails
    if (client) {
      await client.close()
    }
  }
})()

Using chrome-remote-interface-extra

const { CRIExtra, Events, Page } = require('chrome-remote-interface-extra')
const { CRIExtraWARCGenerator, CRIExtraCapturer } = require('node-warc')

/**
 * Connects to the browser at localhost:9222, captures while loading
 * https://example.com, then asks the generator for a WARC at myWARC.warc.
 * Errors are logged; the client is closed whenever a connection was made.
 */
async function captureToWARC () {
  let client = null
  try {
    // connect to endpoint
    client = await CRIExtra({ host: 'localhost', port: 9222 })
    const page = await Page.create(client)

    const capturer = new CRIExtraCapturer(page, Events.Page.Request)
    capturer.startCapturing()
    await page.goto('https://example.com', { waitUntil: 'networkIdle' })

    const generator = new CRIExtraWARCGenerator()
    const generatorOptions = {
      warcOpts: { warcPath: 'myWARC.warc' },
      winfo: {
        description: 'I created a warc!',
        isPartOf: 'My awesome pywb collection'
      }
    }
    await generator.generateWARC(capturer, generatorOptions)
  } catch (err) {
    console.error(err)
  } finally {
    if (client) await client.close()
  }
}

captureToWARC()

Using Puppeteer

const puppeteer = require('puppeteer')
const { Events } = require('puppeteer')
const { PuppeteerWARCGenerator, PuppeteerCapturer } = require('node-warc')

;(async () => {
  let browser
  try {
    browser = await puppeteer.launch()
    const page = await browser.newPage()
    const cap = new PuppeteerCapturer(page, Events.Page.Request)
    cap.startCapturing()
    await page.goto('http://example.com', { waitUntil: 'networkidle0' })
    const warcGen = new PuppeteerWARCGenerator()
    await warcGen.generateWARC(cap, {
      warcOpts: {
        warcPath: 'myWARC.warc'
      },
      winfo: {
        description: 'I created a warc!',
        isPartOf: 'My awesome pywb collection'
      }
    })
    await page.close()
  } catch (err) {
    console.error(err)
  } finally {
    // always shut the launched browser down, even when the capture fails
    if (browser) {
      await browser.close()
    }
  }
})()

Note

The generateWARC method used in the preceding examples is a helper function that keeps the WARC generation process simple. See its implementation for a full example of WARC generation using node-warc.

Or see one of the crawler implementations provided by Squidwarc.