1+ #!/usr/bin/env python3
2+ """
3+ Sitemap Generator for maifeeulasad.github.io
4+
5+ This script automatically generates a sitemap.xml file by scanning the website
6+ directory for HTML files and important assets like resume PDFs.
7+ """
8+
import os
import re
import sys
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote
from xml.dom import minidom
16+
17+
class SitemapGenerator:
    """Build a ``sitemap.xml`` for a static site by scanning a directory tree.

    The generator collects every ``*.html`` file below ``base_dir`` (minus the
    configured exclusions) plus the most recently modified resume PDF, turns
    each into a ``{'loc', 'lastmod', 'priority'}`` entry and serialises the
    result as a sitemaps.org 0.9 ``<urlset>`` document.

    Attributes:
        base_url: Site root URL without a trailing slash.
        base_dir: Directory that is scanned and that receives the output file.
        urls: Entries produced by the last :meth:`generate_urls` call.
        priority_map: Priority per page category (see :meth:`get_priority`).
        exclude_dirs: Directory names that are never scanned.
        exclude_files: File names that are never listed.
    """

    # (substring looked for in the lower-cased path, key into priority_map),
    # checked in this order; the first match wins.
    _KEYWORD_PRIORITY_KEYS = (
        ('showcase', 'showcase'),
        ('resume', 'resume'),
        ('bangladesh', 'Bangladesh'),
        ('gallery', 'gallery'),
    )

    def __init__(self, base_url="https://maifeeulasad.github.io", base_dir="."):
        """Store the site root URL and the directory to scan.

        Args:
            base_url: Public URL of the site; trailing slashes are stripped.
            base_dir: Directory to scan, as a string or ``Path``.
        """
        self.base_url = base_url.rstrip('/')
        self.base_dir = Path(base_dir)
        self.urls = []

        # Define priority mappings
        self.priority_map = {
            'index.html': 1.0,   # Homepage
            'showcase': 0.9,     # Portfolio/showcase
            'resume': 0.8,       # Resume files
            'Bangladesh': 0.7,   # Special projects
            'gallery': 0.6,      # Gallery
            'default': 0.5,      # Other pages
        }

        # Directories to exclude from sitemap
        self.exclude_dirs = {
            '.git', '.github', 'node_modules', '__pycache__',
            '.vscode', 'test',  # Excluding test directory
        }

        # Files to exclude
        self.exclude_files = {
            'gulpfile.js', 'package.json', 'pnpm-lock.yaml',
            'pnpm-workspace.yaml', 'generate_sitemap.py',
        }

    def get_priority(self, path_str):
        """Return the sitemap priority (a float) for a site-relative path.

        Only the root ``index.html`` (optionally written ``./index.html``)
        gets the homepage priority. Every other path is matched
        case-insensitively against the category keywords, falling back to
        the default priority.

        Args:
            path_str: Path relative to the site root, as a string.
        """
        path_lower = str(path_str).lower()

        # Path() collapses a leading "./", so the homepage has exactly one
        # part. A substring test here would also promote pages such as
        # "blog/index.html" or "x/my-index.html" to homepage priority.
        if Path(path_lower).parts == ('index.html',):
            return self.priority_map['index.html']

        for keyword, map_key in self._KEYWORD_PRIORITY_KEYS:
            if keyword in path_lower:
                return self.priority_map[map_key]

        return self.priority_map['default']

    def _matches_exclusion(self, rel_path):
        """Return True if a path relative to ``base_dir`` hits an exclusion."""
        if any(part in self.exclude_dirs for part in rel_path.parts):
            return True
        return rel_path.name in self.exclude_files

    def should_exclude_path(self, path):
        """Check if path should be excluded.

        A path located under ``base_dir`` is judged by its position relative
        to ``base_dir``, so the names of the folders that *contain* the site
        (for example a checkout living in ``~/test/site``) never cause every
        page to be dropped.

        Args:
            path: ``Path`` (or string) to test.

        Returns:
            True when any path component is in ``exclude_dirs`` or the file
            name is in ``exclude_files``.
        """
        path = Path(path)
        try:
            path = path.relative_to(self.base_dir)
        except ValueError:
            # Not under base_dir (e.g. already relative): test it as given.
            pass
        return self._matches_exclusion(path)

    def get_file_modification_time(self, file_path):
        """Return the file's modification time as a UTC W3C datetime string.

        The timestamp is converted to UTC before formatting so that the
        ``+00:00`` suffix is truthful regardless of the machine's time zone.
        If the modification time cannot be read, the current UTC time is
        used instead.

        Args:
            file_path: Path of the file (or directory) to inspect.
        """
        try:
            moment = datetime.fromtimestamp(
                os.path.getmtime(file_path), tz=timezone.utc
            )
        except (OSError, OverflowError, ValueError):
            # Missing/unreadable file or out-of-range timestamp: best effort.
            moment = datetime.now(timezone.utc)
        return moment.strftime('%Y-%m-%dT%H:%M:%S+00:00')

    def find_latest_resume(self):
        """Find the most recent resume PDF.

        Returns:
            The ``Path`` of the ``*.pdf`` in ``<base_dir>/assets/resume``
            with the newest modification time, or ``None`` when the
            directory does not exist or holds no PDF.
        """
        resume_dir = self.base_dir / 'assets' / 'resume'
        if not resume_dir.is_dir():
            return None

        return max(
            resume_dir.glob('*.pdf'),
            key=lambda pdf: pdf.stat().st_mtime,
            default=None,
        )

    def scan_html_files(self):
        """Scan for all HTML files.

        Returns:
            Sorted list of ``*.html`` paths relative to ``base_dir``,
            without the excluded directories and files.
        """
        relative_paths = (
            found.relative_to(self.base_dir)
            for found in self.base_dir.rglob('*.html')
        )
        # Exclusions are tested on the relative path on purpose: the folders
        # above base_dir must not influence what gets listed.
        return sorted(
            rel for rel in relative_paths if not self._matches_exclusion(rel)
        )

    def create_url_entry(self, path, is_file=True):
        """Create a URL entry for the sitemap.

        Args:
            path: Path relative to ``base_dir``.
            is_file: True when ``path`` is a file; False when it is a
                directory that should be listed as ``.../dir/``.

        Returns:
            Dict with ``loc`` (absolute, percent-encoded URL), ``lastmod``
            (UTC timestamp string) and ``priority`` (two-decimal string).
        """
        path = Path(path)
        # Sitemap URLs must be URL-escaped; quote() keeps "/" intact.
        encoded = quote(path.as_posix())

        if is_file:
            if path.name == 'index.html' and len(path.parts) == 1:
                # Root index.html should map to base URL
                url = f"{self.base_url}/"
            else:
                url = f"{self.base_url}/{encoded}"
            mtime_source = path
        else:
            # For directories with index.html, use directory URL
            url = f"{self.base_url}/{encoded}/"
            # The page served at the directory URL is its index.html, so that
            # file's mtime is what "last modified" means. A directory's own
            # mtime only changes when entries are added or removed.
            index_file = path / 'index.html'
            if (self.base_dir / index_file).is_file():
                mtime_source = index_file
            else:
                mtime_source = path

        priority = self.get_priority(str(path))
        lastmod = self.get_file_modification_time(self.base_dir / mtime_source)

        return {
            'loc': url,
            'lastmod': lastmod,
            'priority': f"{priority:.2f}",
        }

    def generate_urls(self):
        """Generate all URLs for the sitemap.

        Fills ``self.urls`` with one entry per page plus the latest resume
        PDF, de-duplicated by URL (first occurrence wins) and sorted by
        descending priority, then by URL.
        """
        entries = []

        for html_file in self.scan_html_files():
            if html_file.name == 'index.html' and len(html_file.parts) > 1:
                # An index.html in a subdirectory is listed once, under its
                # directory URL.
                entries.append(
                    self.create_url_entry(html_file.parent, is_file=False)
                )
            else:
                entries.append(self.create_url_entry(html_file, is_file=True))

        # Add latest resume
        latest_resume = self.find_latest_resume()
        if latest_resume:
            resume_rel_path = latest_resume.relative_to(self.base_dir)
            entries.append(self.create_url_entry(resume_rel_path, is_file=True))

        # Remove duplicates, keeping the first entry seen for each URL.
        unique = {}
        for entry in entries:
            unique.setdefault(entry['loc'], entry)

        # Sort by priority (descending) then by URL
        self.urls = sorted(
            unique.values(),
            key=lambda entry: (-float(entry['priority']), entry['loc']),
        )

    def generate_xml(self):
        """Generate the sitemap XML.

        Returns:
            The ``<urlset>`` root ``Element`` holding one ``<url>`` child
            (``loc``, ``lastmod``, ``priority``) per entry in ``self.urls``.
        """
        urlset = ET.Element('urlset')
        urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
        urlset.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
        urlset.set('xsi:schemaLocation',
                   'http://www.sitemaps.org/schemas/sitemap/0.9 '
                   'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd')

        for url_data in self.urls:
            url_elem = ET.SubElement(urlset, 'url')
            for tag in ('loc', 'lastmod', 'priority'):
                ET.SubElement(url_elem, tag).text = url_data[tag]

        return urlset

    def format_xml(self, element):
        """Format XML with proper indentation.

        Args:
            element: Root ``Element`` to serialise.

        Returns:
            Indented XML text without an XML declaration.
        """
        rough_string = ET.tostring(element, 'unicode')
        reparsed = minidom.parseString(rough_string)
        # minidom writes the XML declaration only at document level, so
        # serialising the root element omits it without relying on the
        # declaration's exact character count.
        return reparsed.documentElement.toprettyxml(indent='\t ')

    def generate_sitemap(self, output_file='sitemap.xml'):
        """Generate the complete sitemap.

        Scans the site, prints a progress report, and writes the formatted
        XML to ``<base_dir>/<output_file>`` as UTF-8.

        Args:
            output_file: Output file name, relative to ``base_dir``.

        Returns:
            ``Path`` of the written sitemap file.
        """
        print("🔍 Scanning website directory...")
        self.generate_urls()

        print(f"📄 Found {len(self.urls)} URLs to include in sitemap")
        for url_data in self.urls:
            print(f" • {url_data['loc']} (priority: {url_data['priority']} )")

        print("\n 🏗️ Generating XML sitemap...")
        xml_element = self.generate_xml()
        xml_content = self.format_xml(xml_element)

        # Write to file
        output_path = self.base_dir / output_file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(xml_content)

        print(f"✅ Sitemap generated successfully: {output_path} ")
        print(f"📊 Total URLs: {len(self.urls)} ")

        return output_path
223+
224+
def main():
    """Run the sitemap generator for the directory containing this script.

    Prints a banner, writes the sitemap next to the script, and reports the
    resulting file's location and size. Any exception raised along the way
    is reported and the process exits with status 1.
    """
    banner_rule = "=" * 50
    print("🌐 Sitemap Generator for maifeeulasad.github.io")
    print(banner_rule)

    # The site root is the folder this script lives in.
    site_root = Path(__file__).parent
    sitemap_builder = SitemapGenerator(base_dir=site_root)

    try:
        written_to = sitemap_builder.generate_sitemap()
        print(f"\n 🎉 Done! Sitemap saved to: {written_to} ")

        # Report how large the generated file is.
        size_in_bytes = written_to.stat().st_size
        print(f"📁 File size: {size_in_bytes:,} bytes")
    except Exception as error:
        print(f"❌ Error generating sitemap: {error} ")
        sys.exit(1)
248+
249+
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments