# validate-links.py - report the status of every href found in the site's HTML pages.


import requests
import glob
import sys
from urllib.parse import *
import re

# HTML pages checked when no file is given on the command line.
# NOTE: the pattern is relative to the current working directory and the
# glob is evaluated once, when the module is loaded.
DEFAULT_FILES = glob.glob('../site/*/*.html')

# Non-greedy captures of the value of every double-quoted href="..." and
# id="..." attribute found in a page.
REGEX_HREF = re.compile(r'href="(.*?)"')
REGEX_ID = re.compile(r'id="(.*?)"')

# Two-character marker printed in front of each URL, keyed by the result
# of check_url(): None = not checked, False = broken, True = valid.
STATUS = {
    None: '? ',
    False: 'KO',
    True: 'ok',
}
def check_url(url, ids=(), timeout=10):
    """Check whether a link target found in a page can be resolved.

    Args:
        url: The raw value of an ``href`` attribute.
        ids: Collection of the ``id`` values defined in the page that
            contains the link; used to resolve ``#fragment`` links.
        timeout: Seconds to wait for an external server before the link
            is considered broken.

    Returns:
        True if the link is valid, False if it is broken, or None when
        the link is of a kind that is not checked (anything that is
        neither a ``#fragment`` nor an ``http://`` / ``https://`` URL).
    """
    # Internal links: a bare "#" is always valid, otherwise the fragment
    # must match one of the ids of the page.
    if url.startswith('#'):
        fragment = url[1:]
        return (not fragment) or (fragment in ids)

    # Relative links: TODO
    # Test the full scheme so that a relative link which merely begins
    # with "http" (e.g. "http-notes.html") is not fetched as a URL.
    if not url.startswith(('http://', 'https://')):
        return None

    # External http(s) links
    try:
        # Without a timeout a single unresponsive server would block the
        # whole run indefinitely.
        response = requests.get(url, timeout=timeout)
    except Exception:
        # Any failure to fetch means the link is broken. Unlike a bare
        # except, this still lets Ctrl-C / SystemExit stop the script.
        return False
    return response.status_code < 400
    

def check_file(f):
    """Print the status of every link found in one HTML file.

    The file name is printed first, then one line per ``href`` value
    prefixed with its STATUS marker, then an empty line.

    Args:
        f: Path of the HTML file to check.
    """
    print('<-- ', f)

    # Read the page in one go and close the file deterministically.
    with open(f) as page:
        content = page.read()

    # Built once per page: each "#fragment" link is tested against it.
    ids = set(REGEX_ID.findall(content))

    for url in REGEX_HREF.findall(content):
        ok = check_url(url, ids)
        print(STATUS[ok] + '    ' + url)
    print()
    
if __name__ == '__main__':
    # Paths given on the command line take precedence; when there are
    # none, fall back to the default set of pages.
    cli_paths = sys.argv[1:]
    targets = cli_paths if cli_paths else DEFAULT_FILES

    for path in targets:
        check_file(path)