User:Bibipi/LinkArchiver.py
Jump to navigation
Jump to search
This script requires Python 3 and the packages installed by `pip3 install --user mwclient mwparserfromhell waybackpy`. Consider running the script with logging enabled: `python3 LinkArchiver.py |& tee -a LinkArchiver.log`
import mwparserfromhell
import mwclient as mw
from pathlib import Path
import waybackpy
import requests
# Walk every non-redirect page of the wiki, collect each distinct external
# link (ignoring the '#fragment' part), and ask the Wayback Machine to save
# every link that currently answers with HTTP 200. Progress and errors are
# printed to stdout so the run can be captured with `tee`.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
# Seconds to wait on a linked site before giving up; without a timeout a
# single unresponsive host would stall the whole crawl indefinitely.
request_timeout = 30
s = mw.Site('incels.wiki', path='/')
urls = []          # unique URLs in the order they were first seen
seen_urls = set()  # same URLs, kept as a set for O(1) duplicate checks
for page in s.allpages():
    # Redirect pages are skipped; only pages with their own text are parsed.
    if page.redirects_to() is not None:
        continue
    print("Processing page", page.name)
    doc = mwparserfromhell.parse(page.text())
    for link in doc.ifilter_external_links():
        # Drop the fragment so 'page#a' and 'page#b' count as one URL.
        url = str(link.url).split('#')[0]
        if url in seen_urls:
            continue
        seen_urls.add(url)
        urls.append(url)
        print(url)
        try:
            r = requests.get(
                url,
                headers={'User-Agent': user_agent},
                allow_redirects=True,
                timeout=request_timeout,
            )
            if r.status_code != 200:
                print('Broken link!')
            else:
                wbobj = waybackpy.Url(url, user_agent)
                archive = wbobj.save()
                print(archive)
        except Exception as e:
            # Deliberate best-effort boundary: a failure on one URL (network
            # error, timeout, archive refusal) is reported and the crawl
            # moves on to the next link.
            print(e)
print(len(urls), 'unique URLs found.')