JSPM

  • ESM via JSPM
  • ES Module Entrypoint
  • Export Map
  • Keywords
  • License
  • Repository URL
  • TypeScript Types
  • README
  • Created
  • Published
  • Downloads 100
  • Score
    100M100P100Q56872F
  • License MIT

A `URL` parser for crawling purposes.

Package Exports

  • crawler-url-parser

This package does not declare an exports field, so the exports above have been automatically detected and optimized by JSPM instead. If any package subpath is missing, it is recommended to post an issue to the original package (crawler-url-parser) to support the "exports" field. If that is not possible, create a JSPM override to customize the exports field for this package.

Readme

crawler-url-parser

A URL parser for crawling purposes

version downloads node status

Installation

npm install crawler-url-parser

Usage

Parse

const cup = require('crawler-url-parser');

//// parse(current_url[,base_url])
let result = cup.parse("http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");

console.log(result.url);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2

console.log(result.baseurl);
// null

console.log(result.normalized);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2

console.log(result.host); 
// question.stackoverflow.com

console.log(result.domain); 
// stackoverflow.com

console.log(result.subdomain); 
// question

console.log(result.protocol); 
// http:

console.log(result.path); 
// /aaa/bbb/ddd

console.log(result.search); 
// q1=query1&q2=query2

console.log(result.querycount); 
// 2

Parse with baseURL

const cup = require('crawler-url-parser');

//// parse(current_url[,base_url])
let result = cup.parse("../ddd?q1=query1&q2=query2","http://question.stackoverflow.com/aaa/bbb/ccc/");

console.log(result.url);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2

console.log(result.baseurl);
// http://question.stackoverflow.com/aaa/bbb/ccc

console.log(result.normalized);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2

console.log(result.host); 
// question.stackoverflow.com

console.log(result.domain); 
// stackoverflow.com

console.log(result.subdomain); 
// question

console.log(result.protocol); 
// http:

console.log(result.path); 
// /aaa/bbb/ddd

console.log(result.search); 
// q1=query1&q2=query2

console.log(result.querycount); 
// 2

Extract

const cup = require('crawler-url-parser');

//// extract(html_str,current_url);
let htmlStr='<html><body> \
    <a href="http://best.question.stackoverflow.com">subdomain</a><br /> \
    <a href="http://faq.stackoverflow.com">subdomain</a><br /> \
    <a href="http://stackoverflow.com">updomain</a><br /> \
    <a href="http://www.google.com">external</a><br /> \
    <a href="http://www.facebook.com">external</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/bbb/ccc">sublevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/bbb/zzz">sublevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/">uplevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/ddd">samelevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/eee">samelevel</a><br /> \
    <a href="http://question.stackoverflow.com/aaa/ddd/eee">internal</a><br /> \
    <a href="http://question.stackoverflow.com/zzz">internal</a><br /> \
</body></html>';

let currentUrl= "http://question.stackoverflow.com/aaa/bbb";
let urls = cup.extract(htmlStr,currentUrl);

console.log(urls[0].type); //subdomain
console.log(urls[1].type); //subdomain
console.log(urls[2].type); //updomain
console.log(urls[3].type); //external
console.log(urls[4].type); //external
console.log(urls[5].type); //sublevel
console.log(urls[6].type); //sublevel
console.log(urls[7].type); //uplevel
console.log(urls[8].type); //samelevel
console.log(urls[9].type); //samelevel
console.log(urls[10].type); //internal
console.log(urls[11].type); //internal

Level

const cup = require('crawler-url-parser');

//// gettype(current_url,base_url);
let level = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
console.log(level); //sublevel

level = cup.gettype("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc");
console.log(level); //uplevel

level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc");
console.log(level); //samelevel

level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc");
console.log(level); //external

Test

Support

I use this package actively myself, so it has my top priority. You can chat with me on WhatsApp about any information, ideas, or suggestions.

WhatsApp

Submitting an Issue

If you find a bug or a mistake, you can help by submitting an issue to the GitLab repository.

Creating a Merge Request

GitLab calls it a merge request instead of a pull request.

License

MIT licensed, and all its dependencies are MIT or BSD licensed.