JSPM

  • ESM via JSPM
  • ES Module Entrypoint
  • Export Map
  • Keywords
  • License
  • Repository URL
  • TypeScript Types
  • README
  • Created
  • Published
  • Downloads 30
  • Score
    100M100P100Q61429F
  • License MIT

A `URL` parser for crawling purposes.

Package Exports

  • crawler-url-parser

This package does not declare an exports field, so the exports above have been automatically detected and optimized by JSPM instead. If any package subpath is missing, it is recommended to post an issue to the original package (crawler-url-parser) to support the "exports" field. If that is not possible, create a JSPM override to customize the exports field for this package.

Readme

crawler-url-parser

A URL parser for crawling purposes

logo

version downloads node status

Installation

npm install crawler-url-parser

Usage

Parse

const cup = require('crawler-url-parser');

//// parse(current_url[,base_url])
let result = cup.parse("http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");

console.log(result.url);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2

console.log(result.baseurl);
// null

console.log(result.normalized);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2

console.log(result.host); 
// question.stackoverflow.com

console.log(result.domain); 
// stackoverflow.com

console.log(result.subdomain); 
// question

console.log(result.protocol); 
// http:

console.log(result.path); 
// /aaa/bbb/ddd

console.log(result.search); 
// q1=query1&q2=query2

console.log(result.querycount); 
// 2

Parse with baseURL

const cup = require('crawler-url-parser');

//// parse(current_url[,base_url])
let result = cup.parse("../ddd?q1=query1&q2=query2","http://question.stackoverflow.com/aaa/bbb/ccc/");

console.log(result.url);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2

console.log(result.baseurl);
// http://question.stackoverflow.com/aaa/bbb/ccc

console.log(result.normalized);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2

console.log(result.host); 
// question.stackoverflow.com

console.log(result.domain); 
// stackoverflow.com

console.log(result.subdomain); 
// question

console.log(result.protocol); 
// http:

console.log(result.path); 
// /aaa/bbb/ddd

console.log(result.search); 
// q1=query1&q2=query2

console.log(result.querycount); 
// 2

Extract

const cup = require('crawler-url-parser');

//// extract(html_str,current_url);
let htmlStr='<html><body> \
    <a href="http://best.question.stackoverflow.com">subdomain</a><br /> \
    <a href="http://faq.stackoverflow.com">subdomain</a><br /> \
    <a href="http://stackoverflow.com">updomain</a><br /> \
    <a href="http://www.google.com">external</a><br /> \
    <a href="http://www.facebook.com">external</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/bbb/ccc">sublevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/bbb/zzz">sublevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/">uplevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/ddd">samelevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/eee">samelevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/ddd/eee">internal</a><br /> \
    <a href="http://question.stackoverflow.com/zzz">internal</a><br /> \
</body></html>';

let currentUrl= "http://question.stackoverflow.com/aaa/bbb";
let urls = cup.extract(htmlStr,currentUrl);

console.log(urls[0].type); //subdomain
console.log(urls[1].type); //subdomain
console.log(urls[2].type); //updomain
console.log(urls[3].type); //external
console.log(urls[4].type); //external
console.log(urls[5].type); //sublevel
console.log(urls[6].type); //sublevel
console.log(urls[7].type); //uplevel
console.log(urls[8].type); //samelevel
console.log(urls[9].type); //samelevel
console.log(urls[10].type); //internal
console.log(urls[11].type); //internal

Level

const cup = require('crawler-url-parser');

//// gettype(current_url,base_url);
let level = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
console.log(level); //sublevel

level = cup.gettype("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc");
console.log(level); //uplevel

level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc");
console.log(level); //samelevel

level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc");
console.log(level); //external

Test

mocha or npm test

The package has more than 200 unit test cases. Check the test folder and QUICKSTART.js for additional usage examples.