How do I start scraping with 1.13.0 #question #2103

Stump-Mt-Firewood · 2024-11-22T14:54:59Z

zimit can't convert addresses containing ":", e.g. "https://supermemo.guru/wiki/Special:RecentChangesLinked/File:Learning_process_simulation.jpg".
So I came to mwoffliner, as recommended.
Recent question: openzim/zimit#435

However, I am having some trouble: I do not know what to enter in the crawler command.

Here is the command I want to run:

mwoffliner --mwUrl=https://supermemo.guru/ --adminEmail=[email protected] --outputDirectory=/output_mw --speed=4 --getCategories --customZimFavicon=https://supermemo.guru/resources/assets/SuperMemory-Guru-logo.png --publisher=Stump --filenamePrefix=SuperMemo_guru_1 --customZimTitle=SuperMemo_guru --customZimDescription="SuperMemo_guru website save archive" --customZimTags=Knowledge --customMainPage=https://supermemo.guru/

Version:
Python 3.10.12
node v18.20.5
Mwoffliner: Github releases latest: 1.13.0

Network :

# ping supermemo.guru
PING supermemo.guru (209.124.66.20) 56(84) bytes of data.
64 bytes from nl1-ts108.a2hosting.com (209.124.66.20): icmp_seq=1 ttl=128 time=0.879 ms
64 bytes from nl1-ts108.a2hosting.com (209.124.66.20): icmp_seq=2 ttl=128 time=0.825 ms

Error message:


(node:4827) NOTE: The AWS SDK for JavaScript (v2) is in maintenance mode.
 SDK releases are limited to address critical bug fixes and security issues only.

Please migrate your code to use AWS SDK for JavaScript (v3).
For more information, check the blog post at https://a.co/cUPnyil
(Use `node --trace-warnings ...` to show where the warning was created)
[error] [2024-11-22T14:29:41.386Z] FATAL - Failed to get MediaWiki Metadata
[error] [2024-11-22T14:29:41.387Z] Failed to run mwoffliner after [13s]: {
	"message": "Request failed with status code 404",
	"name": "AxiosError",
	"stack": "AxiosError: Request failed with status code 404\n    at settle (file:///root/.local/share/fnm/node-versions/v18.20.5/installation/lib/node_modules/mwoffliner/node_modules/axios/lib/core/settle.js:19:12)\n    at IncomingMessage.handleStreamEnd (file:///root/.local/share/fnm/node-versions/v18.20.5/installation/lib/node_modules/mwoffliner/node_modules/axios/lib/adapters/http.js:599:11)\n    at IncomingMessage.emit (node:events:529:35)\n    at IncomingMessage.emit (node:domain:489:12)\n    at endReadableNT (node:internal/streams/readable:1400:12)\n    at process.processTicksAndRejections (node:internal/process/task_queues:82:21)\n    at Axios.request (file:///root/.local/share/fnm/node-versions/v18.20.5/installation/lib/node_modules/mwoffliner/node_modules/axios/lib/core/Axios.js:45:41)\n    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)",
	"config": {
		"transitional": {
			"silentJSONParsing": true,
			"forcedJSONParsing": true,
			"clarifyTimeoutError": false
		},
		"adapter": [
			"xhr",
			"http",
			"fetch"
		],
		"transformRequest": [
			null
		],
		"transformResponse": [
			null
		],
		"timeout": 120000,
		"xsrfCookieName": "XSRF-TOKEN",
		"xsrfHeaderName": "X-XSRF-TOKEN",
		"maxContentLength": -1,
		"maxBodyLength": -1,
		"env": {},
		"headers": {
			"Accept": "application/json",
			"cache-control": "public, max-stale=86400",
			"accept-encoding": "gzip, deflate",
			"user-agent": "MWOffliner/HEAD ([email protected])",
			"cookie": ""
		},
		"httpAgent": {
			"_events": {},
			"_eventsCount": 2,
			"defaultPort": 80,
			"protocol": "http:",
			"options": {
				"keepAlive": true,
				"noDelay": true,
				"path": null
			},
			"requests": {},
			"sockets": {},
			"freeSockets": {},
			"keepAliveMsecs": 1000,
			"keepAlive": true,
			"maxSockets": null,
			"maxFreeSockets": 256,
			"scheduling": "lifo",
			"maxTotalSockets": null,
			"totalSocketCount": 0
		},
		"httpsAgent": {
			"_events": {},
			"_eventsCount": 2,
			"defaultPort": 443,
			"protocol": "https:",
			"options": {
				"keepAlive": true,
				"noDelay": true,
				"path": null
			},
			"requests": {},
			"sockets": {},
			"freeSockets": {
				"supermemo.guru:443:::::::::::::::::::::": [
					{
						"_tlsOptions": {
							"pipe": false,
							"secureContext": {
								"context": {}
							},
							"isServer": false,
							"requestCert": true,
							"rejectUnauthorized": true
						},
						"_secureEstablished": true,
						"_securePending": false,
						"_newSessionPending": false,
						"_controlReleased": true,
						"secureConnecting": false,
						"_SNICallback": null,
						"servername": "supermemo.guru",
						"alpnProtocol": false,
						"authorized": true,
						"authorizationError": null,
						"encrypted": true,
						"_events": {
							"close": [
								null,
								null,
								null
							],
							"timeout": [
								null,
								null
							]
						},
						"_eventsCount": 9,
						"connecting": false,
						"_hadError": false,
						"_parent": null,
						"_host": "supermemo.guru",
						"_closeAfterHandlingError": false,
						"_readableState": {
							"state": 266328,
							"highWaterMark": 16384,
							"buffer": {
								"head": null,
								"tail": null,
								"length": 0
							},
							"length": 0,
							"pipes": [],
							"flowing": true,
							"errored": null,
							"defaultEncoding": "utf8",
							"awaitDrainWriters": null,
							"decoder": null,
							"encoding": null
						},
						"_writableState": {
							"objectMode": false,
							"highWaterMark": 16384,
							"finalCalled": false,
							"needDrain": false,
							"ending": false,
							"ended": false,
							"finished": false,
							"destroyed": false,
							"decodeStrings": false,
							"defaultEncoding": "utf8",
							"length": 0,
							"writing": false,
							"corked": 0,
							"sync": false,
							"bufferProcessing": false,
							"writecb": null,
							"writelen": 0,
							"afterWriteTickInfo": null,
							"buffered": [],
							"bufferedIndex": 0,
							"allBuffers": true,
							"allNoop": true,
							"pendingcb": 0,
							"constructed": true,
							"prefinished": false,
							"errorEmitted": false,
							"emitClose": false,
							"autoDestroy": true,
							"errored": null,
							"closed": false,
							"closeEmitted": false
						},
						"allowHalfOpen": false,
						"_sockname": null,
						"_pendingData": null,
						"_pendingEncoding": "",
						"_server": null,
						"ssl": {
							"_parent": {
								"reading": true,
								"onconnection": null
							},
							"_parentWrap": null,
							"_secureContext": {
								"context": {}
							},
							"reading": true
						},
						"_requestCert": true,
						"_rejectUnauthorized": true,
						"parser": null,
						"_httpMessage": null,
						"timeout": 0
					}
				]
			},
			"keepAliveMsecs": 1000,
			"keepAlive": true,
			"maxSockets": null,
			"maxFreeSockets": 256,
			"scheduling": "lifo",
			"maxTotalSockets": null,
			"totalSocketCount": 1,
			"maxCachedSessions": 100,
			"_sessionCache": {
				"map": {},
				"list": []
			}
		},
		"responseType": "json",
		"method": "get",
		"url": "https://supermemo.guru/w/api.php?action=query&meta=siteinfo&format=json&siprop=general|namespaces|statistics|variables|category|wikidesc"
	},
	"code": "ERR_BAD_REQUEST",
	"status": 404
}
[error] [2024-11-22T14:29:41.387Z] 

**********

Request failed with status code 404

**********

It looks like the web page could not be accessed and access was denied. Unlike zimit, mwoffliner requires the mandatory parameter --adminEmail, but I don't know what to enter for it, because it is not possible to freely register an account and log in on supermemo.guru.

So I would like to ask how to set this up: do I need to configure it in a .js file, add API information, and so on?

The text was updated successfully, but these errors were encountered:

Stump-Mt-Firewood · 2024-11-22T15:00:33Z

Or is mwoffliner unable to make an archive of supermemo.guru? But zimit can still crawl it, so I don't think that is the right answer.

Stump-Mt-Firewood changed the title ~~How do I start scraping with 1.13.0~~ How do I start scraping with 1.13.0 #question Nov 22, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How do I start scraping with 1.13.0 #question #2103

How do I start scraping with 1.13.0 #question #2103

Stump-Mt-Firewood commented Nov 22, 2024 •

edited

Loading

Stump-Mt-Firewood commented Nov 22, 2024

How do I start scraping with 1.13.0 #question #2103

How do I start scraping with 1.13.0 #question #2103

Comments

Stump-Mt-Firewood commented Nov 22, 2024 • edited Loading

Stump-Mt-Firewood commented Nov 22, 2024

Stump-Mt-Firewood commented Nov 22, 2024 •

edited

Loading