-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcheck_links.py
More file actions
213 lines (163 loc) · 6.29 KB
/
check_links.py
File metadata and controls
213 lines (163 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python3
"""
Simple script to check internal links with HTTP requests.
"""
import json
import re
import sys
from pathlib import Path
from urllib.parse import urljoin
import requests
def is_external_link(url):
    """Return True when *url* points outside the documentation site.

    A link is treated as external when it starts with ``http://``,
    ``https://``, ``mailto:`` or ``tel:``, or when it is protocol-relative
    (``//host/path``).  The comparison ignores surrounding whitespace and
    letter case, so ``HTTPS://example.com`` is recognised as external too.

    Args:
        url: Link target exactly as written in the document.

    Returns:
        True for external targets, False for everything else (relative
        paths, site-absolute paths, ``#anchor`` links, empty strings).
    """
    normalized = url.strip().lower()
    # "//" covers protocol-relative URLs, which would otherwise be joined
    # onto the local base URL and end up pointing at a foreign host.
    return normalized.startswith(
        ("http://", "https://", "mailto:", "tel:", "//")
    )
def convert_md_to_html(url):
    """Rewrite a link to a ``.md`` file so it points at the ``.html`` page.

    Only a ``.md`` suffix on the path part of *url* is rewritten.  Any
    ``?query`` or ``#fragment`` is preserved untouched, and ``.md`` that
    merely appears inside another name (``page.mdx``, ``icon.mdpi.png``,
    a directory called ``notes.md.old``) is left alone.

    Args:
        url: Link target, optionally followed by a query and/or fragment.

    Returns:
        The link with its path suffix changed from ``.md`` to ``.html``,
        or *url* unchanged when the path does not end in ``.md``.
    """
    # Split at the first "?" or "#" so only the path part is inspected.
    cut = min(
        (pos for pos in (url.find("?"), url.find("#")) if pos != -1),
        default=len(url),
    )
    path, tail = url[:cut], url[cut:]
    if path.endswith(".md"):
        path = path[: -len(".md")] + ".html"
    return path + tail
# Optional quoted title following a markdown link destination, e.g. the
# ' "The guide"' part of [Guide](guide.md "The guide").
_MARKDOWN_TITLE_RE = re.compile(r"""\s+(?:"[^"]*"|'[^']*')$""")


def _clean_markdown_target(target):
    """Return the bare URL from the parenthesised part of a markdown link."""
    target = _MARKDOWN_TITLE_RE.sub("", target.strip())
    # Markdown also allows the destination to be wrapped as <url>.
    if len(target) >= 2 and target[0] == "<" and target[-1] == ">":
        target = target[1:-1]
    return target


def find_internal_links(content):
    """Find all internal links in markdown and HTML content.

    Two link syntaxes are recognised: markdown ``[text](url)`` and HTML
    ``<a href="url">text</a>``.  For markdown links an optional quoted title
    and surrounding whitespace are removed from the target so that only the
    URL itself is reported.  Links for which ``is_external_link`` returns
    True, and links whose target is blank, are skipped.

    Args:
        content: Full text of the document to scan.

    Returns:
        A list of ``(text, url, link_type, offset)`` tuples, where
        ``link_type`` is ``"markdown"`` or ``"html"`` and ``offset`` is the
        character index in *content* at which the match starts.  All
        markdown matches come first, followed by all HTML matches.
    """
    links = []
    # Common patterns for both markdown and HTML links
    patterns = [
        (r"\[([^\]]+)\]\(([^)]+)\)", "markdown"),  # [text](url)
        (r'<a\s+href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>', "html"),
    ]
    for pattern, link_type in patterns:
        for match in re.finditer(pattern, content):
            if link_type == "markdown":
                text = match.group(1)
                url = _clean_markdown_target(match.group(2))
            else:  # html
                url = match.group(1).strip()
                text = match.group(2).strip()
            # Skip blank targets and external links
            if not url or is_external_link(url):
                continue
            links.append((text, url, link_type, match.start()))
    return links
def resolve_relative_url(base_url, current_file, link_url):
    """Resolve *link_url* against the directory of *current_file*.

    Links starting with ``/`` are joined directly onto *base_url*.  Any
    other link is treated as relative to the directory containing
    *current_file*; the combined path is made site-absolute and joined onto
    *base_url*.  ``.`` and ``..`` segments are collapsed by ``urljoin``.

    The directory is joined with plain ``/`` string concatenation rather
    than ``pathlib`` so that a trailing slash on the link (``api/``) is
    preserved and the result never contains OS-specific separators.

    Args:
        base_url: Root URL of the site, e.g. ``http://127.0.0.1:8000``.
        current_file: Path of the document containing the link, relative
            to the documentation root.
        link_url: Link target found in that document.

    Returns:
        The absolute URL as a string.
    """
    if link_url.startswith("/"):
        return urljoin(base_url, link_url)
    # as_posix() yields forward slashes even where the OS separator is "\".
    current_dir = Path(current_file).parent.as_posix()
    if current_dir == ".":
        resolved_path = link_url
    else:
        resolved_path = f"{current_dir}/{link_url}"
    # Ensure path starts with / so it is resolved from the site root
    if not resolved_path.startswith("/"):
        resolved_path = "/" + resolved_path
    return urljoin(base_url, resolved_path)
def build_full_url(base_url, link_url, current_file):
    """Build the full URL for checking or display.

    Two cases are handled:

    * ``#anchor`` links refer to the current page, so the URL is the
      rendered page for *current_file* (its ``.md`` suffix replaced by
      ``.html``) followed by the fragment.
    * Every other link has ``.md`` converted to ``.html`` via
      ``convert_md_to_html`` and is then resolved relative to
      *current_file* via ``resolve_relative_url``.

    Args:
        base_url: Root URL of the site.
        link_url: Link target as found in the document.
        current_file: Path of the document containing the link, relative
            to the documentation root.

    Returns:
        The absolute URL as a string.
    """
    if not link_url.startswith("#"):
        # Regular link - convert .md to .html and resolve
        converted_url = convert_md_to_html(link_url)
        return resolve_relative_url(base_url, current_file, converted_url)

    # Anchor link - resolve from current page
    file_path = Path(current_file).as_posix()
    # Only the file's own suffix is rewritten; a ".md" occurring elsewhere
    # in the path (for example a directory named "notes.mdx") stays intact.
    if file_path.endswith(".md"):
        file_path = file_path[: -len(".md")] + ".html"
    if not file_path.startswith("/"):
        file_path = "/" + file_path
    return urljoin(base_url, file_path + link_url)
def check_link(base_url, link_url, current_file):
    """Request a link over HTTP and report whether it is reachable.

    The link is turned into an absolute URL with ``build_full_url`` and
    fetched with a GET request (5 second timeout).

    Returns:
        A ``(is_working, status)`` tuple.  ``is_working`` is True only for
        an HTTP 200 response; ``status`` is a short human-readable string
        such as ``"200 OK"``, ``"404 Not Found"``, ``"HTTP 500"`` or
        ``"Error: ..."`` when the request itself failed.
    """
    try:
        target = build_full_url(base_url, link_url, current_file)
        code = requests.get(target, timeout=5).status_code
    except requests.RequestException as e:
        return False, f"Error: {e}"

    if code == 200:
        return True, "200 OK"
    if code == 404:
        return False, "404 Not Found"
    return False, f"HTTP {code}"
def create_link_result(
    md_file,
    docs_dir,
    text,
    url,
    link_type,
    line_start,
    content,
    status,
    base_url="http://127.0.0.1:8000",
):
    """Create a standardized link result dictionary.

    Args:
        md_file: Path of the document containing the link.
        docs_dir: Documentation root; *md_file* is reported relative to it.
        text: Link text.
        url: Link target as written in the document.
        link_type: Link syntax label (``"markdown"`` or ``"html"``).
        line_start: Character offset of the link within *content*.
        content: Full text of the document, used to derive the line number.
        status: Status string describing the outcome of the link check.
        base_url: Root URL used to build ``full_url``.  Defaults to the
            local server address so existing callers keep working.

    Returns:
        A dict with the keys ``file``, ``text``, ``url``, ``full_url``,
        ``status``, ``line`` (1-based) and ``link_type``.
    """
    current_file = str(md_file.relative_to(docs_dir))
    full_url = build_full_url(base_url, url, current_file)
    # Line number = number of newlines before the match, plus one.
    line_number = content[:line_start].count("\n") + 1
    return {
        "file": current_file,
        "text": text,
        "url": url,
        "full_url": full_url,
        "status": status,
        "line": line_number,
        "link_type": link_type,
    }
def print_broken_links(broken_links):
    """Print a console report of at most the first ten broken links.

    Nothing is printed when *broken_links* is empty.  Each entry is expected
    to be a dict with the keys ``file``, ``line``, ``text``, ``url``,
    ``full_url`` and ``status``.
    """
    if broken_links:
        print("\n🔴 BROKEN LINKS (showing first 10):")
        print("-" * 50)
        detail_rows = (
            (" Text: ", "text"),
            (" URL: ", "url"),
            (" Full URL: ", "full_url"),
            (" Status: ", "status"),
        )
        for entry in broken_links[:10]:
            print(f"📄 {entry['file']}:{entry['line']}")
            for label, key in detail_rows:
                print(f"{label}{entry[key]}")
            # Blank line between entries
            print()
def save_results(broken_links, working_links, docs_dir, base_url):
    """Save results to ``broken_links.json`` in the current directory.

    The JSON document contains a ``summary`` section followed by the full
    ``broken_links`` and ``working_links`` lists.

    ``total_files_scanned`` counts the ``*.md`` files under *docs_dir*
    whose name does not contain ``README.md``, matching the files that the
    scan in ``main`` actually processes.

    Args:
        broken_links: Result dicts for links that failed the check.
        working_links: Result dicts for links that passed the check.
        docs_dir: ``pathlib.Path`` of the documentation root.
        base_url: Root URL the links were checked against.
    """
    scanned_files = sum(
        1
        for md_file in docs_dir.rglob("*.md")
        if "README.md" not in md_file.name
    )
    results = {
        "summary": {
            "total_files_scanned": scanned_files,
            "working_links": len(working_links),
            "broken_links": len(broken_links),
            "base_url": base_url,
        },
        "broken_links": broken_links,
        "working_links": working_links,
    }
    output_file = "broken_links.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n📄 Results saved to: {output_file}")
def main():
    """Check every internal link in the markdown files under ``docs``.

    Each ``*.md`` file (except those whose name contains ``README.md``) is
    read, its internal links are extracted and each one is requested from
    the local server.  A summary is printed, the detailed results are saved
    via ``save_results`` and the broken links are listed on the console.

    A file that cannot be read - including one that is not valid UTF-8 -
    is reported and skipped instead of aborting the whole run.

    Returns:
        The number of broken links found.
    """
    base_url = "http://127.0.0.1:8000"
    docs_dir = Path("docs")
    print(f"🔍 Checking internal links against {base_url}")
    print("=" * 50)
    broken_links = []
    working_links = []
    # Find all markdown files
    for md_file in docs_dir.rglob("*.md"):
        if "README.md" in md_file.name:
            continue
        # Keep the try body limited to the read itself so that only read
        # failures are reported as such.
        try:
            content = md_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            print(f"❌ Error reading {md_file}: {e}")
            continue

        # Same for every link in this file, so compute it once.
        current_file = str(md_file.relative_to(docs_dir))
        for text, url, link_type, line_start in find_internal_links(content):
            is_working, status = check_link(base_url, url, current_file)
            result = create_link_result(
                md_file, docs_dir, text, url, link_type, line_start, content, status
            )
            if is_working:
                working_links.append(result)
            else:
                broken_links.append(result)
    # Print summary
    print(f"✅ Working links: {len(working_links)}")
    print(f"❌ Broken links: {len(broken_links)}")
    # Save results and print broken links
    save_results(broken_links, working_links, docs_dir, base_url)
    print_broken_links(broken_links)
    return len(broken_links)
if __name__ == "__main__":
    # Run the check; the process exit status tells callers whether any
    # broken link was found (1) or everything resolved (0).
    broken_count = main()
    if broken_count <= 0:
        print("\n✅ Script completed successfully - all links working!")
        sys.exit(0)
    # Exit with error if there are broken links
    print(f"\n💥 Script failing due to {broken_count} broken links!")
    sys.exit(1)