User:Bibipi/MediaArchiver.py

From Incel Wiki
Jump to navigation Jump to search

This script requires Python 3, the Python packages mwclient, mwparserfromhell and youtube-dl (pip3 install --user mwclient mwparserfromhell youtube-dl), and ffmpeg (apt install ffmpeg). Consider running the script with logging: python3 MediaArchiver.py |& tee -a MediaArchiver.log

import mwparserfromhell
import mwclient as mw
import youtube_dl
from pathlib import Path

# Connection to the wiki whose pages are crawled by this script; path='/' is
# passed to mwclient as the site's script path.
s = mw.Site('incels.wiki', path='/')

def download(url, page):
    """Download the media at *url* with youtube-dl into a per-page directory.

    The target directory is ``incelswiki-media/<name>``, where ``<name>`` is
    *page* with every non-ASCII character dropped and each ``/``, space and
    ``%`` replaced by ``_``. Files inside it are named from the
    ``%(title)s-%(id)s.%(ext)s`` output template.

    The URL and the youtube-dl options are printed before the download
    starts. Any ``Exception`` raised while downloading is printed instead of
    being propagated. Returns ``None``.
    """
    print(url)
    # '%' is replaced along with the path-unfriendly characters; presumably
    # because the directory becomes part of a %-style output template.
    unsafe_to_underscore = str.maketrans('/ %', '___')
    ascii_only = page.encode('utf8').decode('ascii', 'ignore')
    target_dir = Path("incelswiki-media/" + ascii_only.translate(unsafe_to_underscore))
    try:
        options = {'outtmpl': str(target_dir / '%(title)s-%(id)s.%(ext)s')}
        print(options)
        with youtube_dl.YoutubeDL(options) as downloader:
            downloader.download([url])
    except Exception as err:
        print(err)
    
# Crawl every non-redirect page of the wiki and pass each embedded video and
# external link found in its wikitext to download().
for page in s.allpages():
    if page.redirects_to() is not None:
        continue
    print("Processing page", page.name)
    doc = mwparserfromhell.parse(page.text())

    # URLs taken from embed markup on this page; consulted by the
    # external-link pass below so that a link which is merely the start of an
    # already-handled embed URL is not downloaded a second time.
    urls = []

    # {{#ev...}} parser-function calls. For {{#evu:URL}} the URL is the part
    # of the template name after the first colon; for every other form the
    # first parameter is used.
    for t in doc.ifilter_templates(doc.RECURSE_OTHERS, matches=r'^{{#ev'):
        name = str(t.name)
        if name.startswith('#evu:'):
            url = name.split(':', 1)[1].strip()
        elif t.params:
            # Stripped like the #evu branch so surrounding whitespace in the
            # wikitext neither reaches the downloader nor defeats the prefix
            # check in the external-link pass.
            url = str(t.params[0]).strip()
        else:
            # No parameter to read a URL from; indexing params[0] here would
            # raise IndexError and abort the whole crawl.
            continue
        if not url:
            continue
        urls.append(url)
        download(url, page.name)

    # Tags whose markup starts with <youtube or <embedvideo: the tag contents
    # are passed to download() as the URL.
    for ta in doc.ifilter_tags(matches=r'^<(youtube|embedvideo)'):
        url = str(ta.contents).strip()
        if not url:
            continue
        urls.append(url)
        download(url, page.name)

    # Remaining external links, skipping any that is a prefix of a URL
    # already collected from the embed markup above.
    for link in doc.ifilter_external_links():
        url = str(link.url).strip()
        if not url or any(u.startswith(url) for u in urls):
            continue
        download(url, page.name)