validate-links.py 1.29 KB
Newer Older
1 2 3 4 5 6 7


import requests
import glob
import sys
from urllib.parse import *
import re
8
from collections import defaultdict
9 10

# Files checked when no paths are given on the command line.
# NOTE: the pattern is relative to the current working directory and is
# expanded once, at import time.
DEFAULT_FILES = glob.glob('../site/*/*.html')

# Attribute matchers (double-quoted values only).
# The negative lookbehind stops `href=` / `id=` from matching the tail of a
# longer attribute name such as `data-id=` or `valid=`, which would otherwise
# register bogus links and anchors.
REGEX_HREF = re.compile(r'(?<![\w-])href="(.*?)"')
REGEX_ID = re.compile(r'(?<![\w-])id="(.*?)"')

# Label printed for each check_url() outcome. Insertion order is the order
# in which the summary lines are printed.
STATUS = {
    None: '? ',   # link was not checked
    False: 'KO',  # link is broken
    True: 'ok',   # link is valid
}

# Number of links seen per outcome, keyed like STATUS.
stats = defaultdict(int)

23 24 25 26 27 28 29
def check_url(url, ids=(), timeout=10):
    """Check a single link target.

    Args:
        url: The raw ``href`` value.
        ids: Container of element ids defined in the page holding the link;
            used to resolve ``#fragment`` links.
        timeout: Seconds to wait for an external server before the link is
            considered broken.

    Returns:
        True if the link is valid, False if it is broken, or None if it was
        not checked (anything that is neither a fragment nor ``http...``).
    """
    # Internal links: a bare '#' is always valid.
    if url.startswith('#'):
        anchor = url[1:]
        return (not anchor) or (anchor in ids)

    # Relative links: TODO
    if not url.startswith('http'):
        return None

    # External http(s) links
    try:
        # Without a timeout an unresponsive host would stall the whole run.
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError):
        # Unreachable host, malformed URL, timeout, ...: count as broken.
        # KeyboardInterrupt / SystemExit deliberately propagate.
        return False
    return response.status_code < 400
    

def check_file(f):
    """Check every ``href`` found in one HTML file and print the results.

    Each link is printed with its status label, and the module-level
    ``stats`` counter for that outcome is incremented.

    Args:
        f: Path of the HTML file to scan.
    """
    print('<-- ', f)
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm whether the site files should be read as UTF-8 explicitly.
    with open(f) as handle:
        content = handle.read()

    # A set makes each '#fragment' lookup O(1) on pages with many ids.
    ids = set(REGEX_ID.findall(content))

    for url in REGEX_HREF.findall(content):
        ok = check_url(url, ids)
        print(STATUS[ok] + '    ' + url)
        stats[ok] += 1
    print()
52 53 54 55 56 57 58


def print_stats():
    """Print the number of links counted for each outcome in STATUS."""
    print('==== Summary')
    for outcome in STATUS:
        label = STATUS[outcome]
        count = stats[outcome]
        print('  %s : %3d' % (label, count))

59 60 61 62 63 64 65
    
if __name__ == '__main__':

    # Paths given on the command line take precedence over the default glob.
    files = sys.argv[1:] or DEFAULT_FILES

    for f in files:
        check_file(f)

    print_stats()

    # Exit with a non-zero status when at least one link is broken.
    if stats[False]:
        sys.exit(1)