User:Bibipi/LinkArchiver.py
Jump to navigation
Jump to search
This script requires Python 3 and the following packages: pip3 install --user mwclient mwparserfromhell waybackpy
Consider running the script with logging (bash syntax): python3 LinkArchiver.py |& tee -a LinkArchiver.log
from pathlib import Path

import mwclient as mw
import mwparserfromhell
import requests
import waybackpy

# LinkArchiver: walk every non-redirect page of the wiki, collect the unique
# external links (ignoring #fragments), check that each one still responds
# with HTTP 200, and ask waybackpy to save a snapshot of every live link.
# Progress and errors are reported on stdout.

user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

# Seconds to wait for a server before giving up on a link. Without a timeout
# a single unresponsive host would stall the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

s = mw.Site('incels.wiki', path='/')

# A set gives O(1) duplicate checks; the same link often appears on many pages.
urls = set()

for page in s.allpages():
    # Redirect pages carry no content of their own.
    if page.redirects_to() is not None:
        continue

    print("Processing page", page.name)
    doc = mwparserfromhell.parse(page.text())

    for link in doc.ifilter_external_links():
        # Drop the fragment so that "page#a" and "page#b" count as one URL.
        url = str(link.url).split('#')[0]
        if url in urls:
            continue
        urls.add(url)
        print(url)

        try:
            # Only the status code is needed: stream=True avoids downloading
            # the body, and the context manager releases the connection.
            with requests.get(
                url,
                headers={'User-Agent': user_agent},
                allow_redirects=True,
                stream=True,
                timeout=REQUEST_TIMEOUT,
            ) as r:
                status_code = r.status_code

            if status_code != 200:
                print('Broken link!')
            else:
                wbobj = waybackpy.Url(url, user_agent)
                archive = wbobj.save()
                print(archive)
        except Exception as e:
            # Deliberately best-effort: one bad link (DNS failure, timeout,
            # archive error, ...) must not abort the whole run.
            print(e)

print(len(urls), 'unique URLs found.')