User:Bibipi/LinkArchiver.py

From Incel Wiki
Jump to navigation Jump to search

This script requires Python 3 and the packages installed by `pip3 install --user mwclient mwparserfromhell waybackpy`. Consider running the script with logging (bash syntax), so that its output is also appended to a log file: `python3 LinkArchiver.py |& tee -a LinkArchiver.log`

import mwparserfromhell
import mwclient as mw
from pathlib import Path
import waybackpy
import requests

# Script body: walk every non-redirect page of the wiki, collect each external
# link once, check that it still answers with HTTP 200 and, if so, ask
# waybackpy to save a snapshot of it. Progress and errors go to stdout.

# User-Agent string sent with the liveness check and passed to waybackpy.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

# Seconds to wait on a linked site before giving up. requests has no default
# timeout, so without this a single unresponsive server stalls the whole run.
REQUEST_TIMEOUT = 30

s = mw.Site('incels.wiki', path='/')

# `urls` holds the unique URLs in discovery order; `seen_urls` mirrors it as a
# set so the duplicate check stays O(1) however many links the wiki contains.
urls = []
seen_urls = set()
for page in s.allpages():
    # Skip pages that redirect to another page.
    if page.redirects_to() is not None:
        continue
    print("Processing page", page.name)
    doc = mwparserfromhell.parse(page.text())

    for link in doc.ifilter_external_links():
        # Drop the fragment so links differing only in their '#...' part
        # are treated as the same URL.
        url = str(link.url).split('#')[0]
        if url in seen_urls:
            continue
        seen_urls.add(url)
        urls.append(url)
        print(url)
        try:
            r = requests.get(
                url,
                headers={'User-Agent': user_agent},
                allow_redirects=True,
                timeout=REQUEST_TIMEOUT,
            )
            if r.status_code != 200:
                print('Broken link!')
            else:
                wbobj = waybackpy.Url(url, user_agent)
                archive = wbobj.save()
                print(archive)
        except Exception as e:
            # Deliberately best-effort: any failure for one URL (bad scheme,
            # connection error, timeout, archiving error) is reported and the
            # crawl moves on to the next link.
            print(e)

print(len(urls), 'unique URLs found.')