Package Exports
- node-warc
This package does not declare an exports field, so the exports above have been automatically detected and optimized by JSPM instead. If any package subpath is missing, it is recommended to post an issue to the original package (node-warc) to support the "exports" field. If that is not possible, create a JSPM override to customize the exports field for this package.
Readme
node-warc
Parse Web Archive (WARC) files or create WARC files using chrome-remote-interface, chrome-remote-interface-extra, or Puppeteer.
Run npm install node-warc or yarn add node-warc to get started
Documentation
Full documentation available at n0tan3rd.github.io/node-warc
Parsing
Using async iteration
Requires node 10 or greater
const fs = require('fs')
const zlib = require('zlib')
// recordIterator only exported if async iteration on readable streams is available
const { recordIterator } = require('node-warc')
/**
 * Logs every record produced by `recordIterator` for the given stream.
 *
 * @param {object} warcStream - stream handed to `recordIterator`
 * @returns {Promise<void>} resolves once the iterator is exhausted
 */
async function iterateRecords (warcStream) {
  const records = recordIterator(warcStream)
  for await (const warcRecord of records) {
    console.log(warcRecord)
  }
}
iterateRecords(
fs.createReadStream('<path-to-gzipd-warcfile>').pipe(zlib.createGunzip())
).then(() => {
console.log('done')
})
Or using one of the parsers
for await (const record of new AutoWARCParser('<path-to-warcfile>')) {
console.log(record)
}
Using Stream Transform
const fs = require('fs')
const { WARCStreamTransform } = require('node-warc')
fs
.createReadStream('<path-to-warcfile>')
.pipe(new WARCStreamTransform())
.on('data', record => {
console.log(record)
})
Both .warc and .warc.gz
const { AutoWARCParser } = require('node-warc')
const parser = new AutoWARCParser('<path-to-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()
Only gzip'd warc files
const { WARCGzParser } = require('node-warc')
const parser = new WARCGzParser('<path-to-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()
Only non gzip'd warc files
// WARCParser handles plain (non-gzipped) WARC files; import the class that is actually constructed.
const { WARCParser } = require('node-warc')
const parser = new WARCParser('<path-to-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()
WARC Creation
Environment
NODEWARC_WRITE_GZIPPED - enable writing gzipped records to WARC outputs.
Examples
Using chrome-remote-interface
const CRI = require('chrome-remote-interface')
const { RemoteChromeWARCGenerator, RemoteChromeCapturer } = require('node-warc')
;(async () => {
const client = await CRI()
await Promise.all([
client.Page.enable(),
client.Network.enable(),
])
const cap = new RemoteChromeCapturer(client.Network)
cap.startCapturing()
await client.Page.navigate({ url: 'http://example.com' });
// actual code should wait for a better stopping condition, eg. network idle
await client.Page.loadEventFired()
const warcGen = new RemoteChromeWARCGenerator()
await warcGen.generateWARC(cap, client.Network, {
warcOpts: {
warcPath: 'myWARC.warc'
},
winfo: {
description: 'I created a warc!',
isPartOf: 'My awesome pywb collection'
}
})
await client.close()
})()
Using chrome-remote-interface-extra
const { CRIExtra, Events, Page } = require('chrome-remote-interface-extra')
const { CRIExtraWARCGenerator, CRIExtraCapturer } = require('node-warc')
;(async () => {
let client
try {
// connect to endpoint
client = await CRIExtra({ host: 'localhost', port: 9222 })
const page = await Page.create(client)
const cap = new CRIExtraCapturer(page, Events.Page.Request)
cap.startCapturing()
await page.goto('https://example.com', { waitUntil: 'networkIdle' })
const warcGen = new CRIExtraWARCGenerator()
await warcGen.generateWARC(cap, {
warcOpts: {
warcPath: 'myWARC.warc'
},
winfo: {
description: 'I created a warc!',
isPartOf: 'My awesome pywb collection'
}
})
} catch (err) {
console.error(err)
} finally {
if (client) {
await client.close()
}
}
})()
Using Puppeteer
const puppeteer = require('puppeteer')
const { Events } = require('puppeteer')
const { PuppeteerWARCGenerator, PuppeteerCapturer } = require('node-warc')
;(async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
const cap = new PuppeteerCapturer(page, Events.Page.Request)
cap.startCapturing()
await page.goto('http://example.com', { waitUntil: 'networkidle0' })
const warcGen = new PuppeteerWARCGenerator()
await warcGen.generateWARC(cap, {
warcOpts: {
warcPath: 'myWARC.warc'
},
winfo: {
description: 'I created a warc!',
isPartOf: 'My awesome pywb collection'
}
})
await page.close()
await browser.close()
})()
Note
The generateWARC method used in the preceding examples is a helper function for making the WARC generation process simple. See its implementation for a full example of WARC generation using node-warc.
Or see one of the crawler implementations provided by Squidwarc.