Create Web Crawler Job
Creates a web crawler job whose objective is to crawl the provided URLs/sitemaps and generate corresponding webpages as artifacts.
All DevRev APIs require a token to authenticate the user. Provide `Authorization: Bearer <TOKEN>` as a header on every API request.
How do I find my token?
In: header
The regex a URL must satisfy to be crawled.
text
The list of regexes a URL must satisfy to be crawled.
items <= 100
The parts to which the webpages/articles created during this crawler job will be linked.
1 <= items <= 1
The description of the job.
text
The list of allowed domain names to crawl.
Number of days between re-sync job runs. If 0, the job will run only once.
int32
The maximum depth to crawl.
int32
Whether to notify the user when the job is complete. Default is true.
A regex that, if matched by a URL, causes the URL to be rejected. If a URL matches both the accept and reject regexes, it is rejected.
text
The list of regexes that, if any is matched by a URL, cause the URL to be rejected. If a URL matches both the accept and reject regexes, it is rejected.
items <= 100
The list of sitemap index URLs to crawl.
items <= 2
The list of sitemap URLs to crawl.
items <= 2
The list of URLs to crawl.
items <= 50
User agent to use for crawling websites in this job.
text
length <= 1024
Response Body
curl -X POST "https://api.devrev.ai/web-crawler-jobs.create" \
  -H "Authorization: Bearer <TOKEN>" \
  -H "Content-Type: application/json" \
  -d '{ "applies_to_parts": [ "PROD-12345" ] }'
{
"web_crawler_job": {
"created_by": {
"type": "dev_user",
"display_id": "string",
"id": "string",
"display_name": "string",
"display_picture": {
"display_id": "string",
"id": "string",
"file": {
"type": "string",
"name": "string",
"size": 0
}
},
"email": "string",
"full_name": "string",
"state": "active"
},
"created_date": "2023-01-01T12:00:00.000Z",
"display_id": "string",
"id": "string",
"modified_by": {
"type": "dev_user",
"display_id": "string",
"id": "string",
"display_name": "string",
"display_picture": {
"display_id": "string",
"id": "string",
"file": {
"type": "string",
"name": "string",
"size": 0
}
},
"email": "string",
"full_name": "string",
"state": "active"
},
"modified_date": "2023-01-01T12:00:00.000Z",
"accept_regexs": [
"string"
],
"description": "string",
"domain_names": [
"string"
],
"frequency": 0,
"max_depth": 0,
"no_parent": true,
"notify_on_complete": true,
"num_bytes": 0,
"num_timeout_urls": 0,
"num_urls_scraped": 0,
"reject_regexs": [
"string"
],
"sitemap_index_urls": [
"string"
],
"sitemap_urls": [
"string"
],
"state": "aborted",
"urls": [
"string"
],
"user_agent": "string"
}
}
{
"detail": "string",
"message": "string",
"type": "artifact_already_attached_to_a_parent",
"existing_parent": "string",
"is_same": true
}
{
"detail": "string",
"message": "string",
"type": "unauthenticated"
}
{
"detail": "string",
"message": "string",
"type": "forbidden"
}
{
"detail": "string",
"message": "string",
"type": "too_many_requests",
"retry_after": 0
}
{
"detail": "string",
"message": "string",
"type": "internal_error",
"reference_id": "string"
}
{
"detail": "string",
"message": "string",
"type": "service_unavailable"
}