Skip to content

Commit 444f36a

Browse files
committed
[util]: generate sitemap script;
1 parent ac05733 commit 444f36a

1 file changed

Lines changed: 251 additions & 0 deletions

File tree

generate_sitemap.py

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Sitemap Generator for maifeeulasad.github.io
4+
5+
This script automatically generates a sitemap.xml file by scanning the website
6+
directory for HTML files and important assets like resume PDFs.
7+
"""
8+
9+
import os
10+
import sys
11+
from datetime import datetime
12+
from pathlib import Path
13+
import xml.etree.ElementTree as ET
14+
from xml.dom import minidom
15+
import re
16+
17+
18+
class SitemapGenerator:
    """Build a ``sitemap.xml`` for a static site by scanning a directory tree.

    The generator collects every ``*.html`` file under ``base_dir`` (minus the
    configured exclusions) plus the most recently modified resume PDF under
    ``assets/resume``, assigns each URL a priority, and writes the result as
    a sitemap XML document.
    """

    def __init__(self, base_url="https://maifeeulasad.github.io", base_dir="."):
        """Configure the generator.

        Args:
            base_url: Public root URL of the site; a trailing ``/`` is removed.
            base_dir: Directory to scan; also where the sitemap is written.
        """
        self.base_url = base_url.rstrip('/')
        self.base_dir = Path(base_dir)
        self.urls = []

        # Priority assigned by get_priority(), keyed by the rule that matched.
        self.priority_map = {
            'index.html': 1.0,   # Homepage
            'showcase': 0.9,     # Portfolio/showcase
            'resume': 0.8,       # Resume files
            'Bangladesh': 0.7,   # Special projects
            'gallery': 0.6,      # Gallery
            'default': 0.5,      # Other pages
        }

        # Directory names that exclude everything beneath them.
        self.exclude_dirs = {
            '.git', '.github', 'node_modules', '__pycache__',
            '.vscode', 'test',  # Excluding test directory
        }

        # File names that are never listed.
        self.exclude_files = {
            'gulpfile.js', 'package.json', 'pnpm-lock.yaml',
            'pnpm-workspace.yaml', 'generate_sitemap.py',
        }

    def get_priority(self, path_str):
        """Return the sitemap priority (a float) for a site-relative path.

        Only the root ``index.html`` counts as the homepage. The remaining
        rules are case-insensitive substring matches, checked in order:
        showcase, resume, bangladesh, gallery, then the default.
        """
        # Normalise Windows separators so the rules behave the same everywhere.
        path_lower = path_str.replace('\\', '/').lower()

        # Ignore empty and '.' segments so './index.html' and '/index.html'
        # still count as the homepage, while 'blog/reindex.html' does not.
        segments = [seg for seg in path_lower.split('/') if seg not in ('', '.')]
        if segments == ['index.html']:
            return self.priority_map['index.html']

        # (substring to look for, key into priority_map)
        rules = (
            ('showcase', 'showcase'),
            ('resume', 'resume'),
            ('bangladesh', 'Bangladesh'),
            ('gallery', 'gallery'),
        )
        for needle, key in rules:
            if needle in path_lower:
                return self.priority_map[key]
        return self.priority_map['default']

    def should_exclude_path(self, path):
        """Return True if any component of *path* is an excluded directory
        name or the file name itself is excluded.

        Every component of *path* is inspected, so callers should pass a path
        relative to the site root; otherwise ancestors of the site directory
        can trigger the exclusion.
        """
        if any(part in self.exclude_dirs for part in path.parts):
            return True
        return path.name in self.exclude_files

    def get_file_modification_time(self, file_path):
        """Return the modification time of *file_path* as a UTC timestamp.

        The result is formatted as ``YYYY-MM-DDTHH:MM:SS+00:00``. If the
        modification time cannot be read, the current UTC time is used.
        """
        # Local import: the module-level import only brings in `datetime`.
        from datetime import timezone

        try:
            # Convert in UTC so the value matches the "+00:00" suffix below.
            moment = datetime.fromtimestamp(os.path.getmtime(file_path),
                                            tz=timezone.utc)
        except (OSError, OverflowError, ValueError):
            moment = datetime.now(tz=timezone.utc)
        return moment.strftime('%Y-%m-%dT%H:%M:%S+00:00')

    def find_latest_resume(self):
        """Return the most recently modified PDF in ``assets/resume``.

        Returns None when the directory is missing or holds no ``*.pdf`` file.
        """
        resume_dir = self.base_dir / 'assets' / 'resume'
        if not resume_dir.exists():
            return None

        return max(resume_dir.glob('*.pdf'),
                   key=lambda pdf: pdf.stat().st_mtime,
                   default=None)

    def scan_html_files(self):
        """Return the sorted, site-relative paths of all HTML files to list."""
        html_files = []

        for html_file in self.base_dir.rglob('*.html'):
            if not html_file.is_file():
                # rglob also matches directories whose name ends in .html.
                continue

            # Apply the exclusion rules to the path relative to the site root,
            # so a parent directory of the site that happens to be called
            # e.g. "test" does not exclude every page.
            rel_path = html_file.relative_to(self.base_dir)
            if self.should_exclude_path(rel_path):
                continue

            html_files.append(rel_path)

        return sorted(html_files)

    def create_url_entry(self, path, is_file=True):
        """Create one sitemap entry for a site-relative path.

        Args:
            path: ``Path`` relative to ``base_dir``.
            is_file: True to link to the file itself; False to link to the
                directory (URL ends with ``/``).

        Returns:
            A dict with ``loc``, ``lastmod`` and ``priority`` (two-decimal
            string) keys.
        """
        # Local import: urllib is not among the module-level imports.
        from urllib.parse import quote

        if is_file and path.name == 'index.html' and len(path.parts) == 1:
            # Root index.html maps to the bare base URL.
            url = f"{self.base_url}/"
        else:
            # Percent-encode spaces and non-ASCII characters; "/" is kept.
            escaped = quote(path.as_posix())
            if is_file:
                url = f"{self.base_url}/{escaped}"
            else:
                url = f"{self.base_url}/{escaped}/"

        lastmod_source = self.base_dir / path
        if not is_file:
            # A directory's own mtime does not change when its index.html is
            # edited, so prefer the page that is actually served.
            index_file = lastmod_source / 'index.html'
            if index_file.is_file():
                lastmod_source = index_file

        priority = self.get_priority(path.as_posix())

        return {
            'loc': url,
            'lastmod': self.get_file_modification_time(lastmod_source),
            'priority': f"{priority:.2f}",
        }

    def generate_urls(self):
        """Populate ``self.urls`` with de-duplicated, sorted sitemap entries.

        Entries are ordered by priority (highest first), then by URL.
        """
        entries = []

        for html_file in self.scan_html_files():
            if html_file.name == 'index.html' and len(html_file.parts) > 1:
                # index.html in a subdirectory is listed by its directory URL.
                entries.append(
                    self.create_url_entry(html_file.parent, is_file=False))
            else:
                entries.append(self.create_url_entry(html_file, is_file=True))

        # Add latest resume
        latest_resume = self.find_latest_resume()
        if latest_resume:
            resume_rel_path = latest_resume.relative_to(self.base_dir)
            entries.append(self.create_url_entry(resume_rel_path, is_file=True))

        # Keep the first entry seen for each URL.
        unique = {}
        for entry in entries:
            unique.setdefault(entry['loc'], entry)

        self.urls = sorted(
            unique.values(),
            key=lambda entry: (-float(entry['priority']), entry['loc']),
        )

    def generate_xml(self):
        """Return a ``<urlset>`` element built from ``self.urls``."""
        urlset = ET.Element('urlset')
        urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
        urlset.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
        urlset.set('xsi:schemaLocation',
                   'http://www.sitemaps.org/schemas/sitemap/0.9 '
                   'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd')

        for url_data in self.urls:
            url_elem = ET.SubElement(urlset, 'url')
            for field in ('loc', 'lastmod', 'priority'):
                ET.SubElement(url_elem, field).text = url_data[field]

        return urlset

    def format_xml(self, element):
        """Return *element* as tab-indented XML text without an XML declaration."""
        rough_string = ET.tostring(element, 'unicode')
        reparsed = minidom.parseString(rough_string)
        # Serialising the root element (rather than the document) leaves out
        # the "<?xml ...?>" line without relying on its exact length.
        return reparsed.documentElement.toprettyxml(indent='\t')

    def generate_sitemap(self, output_file='sitemap.xml'):
        """Scan the site, write the sitemap and return the output path.

        Args:
            output_file: File name (relative to ``base_dir``) to write.

        Returns:
            ``Path`` of the written sitemap. Progress is printed to stdout.
        """
        print("🔍 Scanning website directory...")
        self.generate_urls()

        print(f"📄 Found {len(self.urls)} URLs to include in sitemap")
        for url_data in self.urls:
            print(f" • {url_data['loc']} (priority: {url_data['priority']})")

        print("\n🏗️ Generating XML sitemap...")
        xml_element = self.generate_xml()
        xml_content = self.format_xml(xml_element)

        # Write to file
        output_path = self.base_dir / output_file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(xml_content)

        print(f"✅ Sitemap generated successfully: {output_path}")
        print(f"📊 Total URLs: {len(self.urls)}")

        return output_path
223+
224+
225+
def main():
    """Generate ``sitemap.xml`` next to this script.

    Prints a banner and a short summary; on any failure prints the error and
    exits with status 1.
    """
    print("🌐 Sitemap Generator for maifeeulasad.github.io")
    print("=" * 50)

    # The site root is the directory that contains this script.
    site_root = Path(__file__).parent
    generator = SitemapGenerator(base_dir=site_root)

    try:
        sitemap_path = generator.generate_sitemap()
        print(f"\n🎉 Done! Sitemap saved to: {sitemap_path}")

        # Report how large the written file is.
        file_size = sitemap_path.stat().st_size
        print(f"📁 File size: {file_size:,} bytes")
    except Exception as e:
        print(f"❌ Error generating sitemap: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)