From 9809bc75f4b725cd2a5c9066c4f9f60385b19487 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 08:43:43 -0400 Subject: [PATCH 01/39] refactoring to run all setuid file access on user-specific worker processes (akin to JupyterHub) --- fileglancer/filestore.py | 29 + fileglancer/server.py | 665 +++++++++++------------ fileglancer/settings.py | 4 + fileglancer/user_worker.py | 1025 ++++++++++++++++++++++++++++++++++++ fileglancer/worker_pool.py | 318 +++++++++++ 5 files changed, 1681 insertions(+), 360 deletions(-) create mode 100644 fileglancer/user_worker.py create mode 100644 fileglancer/worker_pool.py diff --git a/fileglancer/filestore.py b/fileglancer/filestore.py index ce3417d6..484ff74a 100644 --- a/fileglancer/filestore.py +++ b/fileglancer/filestore.py @@ -658,6 +658,35 @@ def stream_file_range(self, path: str = None, start: int = 0, end: int = 0, buff file_handle.close() + @staticmethod + def _stream_contents(file_handle, buffer_size: int = DEFAULT_BUFFER_SIZE) -> Generator[bytes, None, None]: + """Stream from an open file handle. Handle is closed when done.""" + try: + while True: + chunk = file_handle.read(buffer_size) + if not chunk: + break + yield chunk + finally: + file_handle.close() + + @staticmethod + def _stream_range(start: int, end: int, content_length: int, + file_handle, buffer_size: int = DEFAULT_BUFFER_SIZE) -> Generator[bytes, None, None]: + """Stream a byte range from an open file handle. Handle is closed when done.""" + try: + file_handle.seek(start) + remaining = content_length + while remaining > 0: + chunk_size = min(buffer_size, remaining) + chunk = file_handle.read(chunk_size) + if not chunk: + break + yield chunk + remaining -= len(chunk) + finally: + file_handle.close() + def rename_file_or_dir(self, old_path: str, new_path: str): """ Rename a file at the given old path to the new path. diff --git a/fileglancer/server.py b/fileglancer/server.py index 365a0b75..3f812223 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -38,6 +38,7 @@ from fileglancer.user_context import UserContext, EffectiveUserContext, CurrentUserContext, UserContextConfigurationError from fileglancer.filestore import Filestore, RootCheckError from fileglancer.log import AccessLogMiddleware +from fileglancer.worker_pool import WorkerPool, WorkerError, WorkerDead from fileglancer import sshkeys from x2s3.utils import get_read_access_acl, get_nosuchbucket_response, get_error_response @@ -214,17 +215,47 @@ def create_app(settings): # Define ui_dir for serving static files and SPA ui_dir = PathLib(__file__).parent / "ui" + # Per-user persistent worker pool (only used when use_access_flags=True) + worker_pool = WorkerPool(settings) if settings.use_access_flags else None + def _get_user_context(username: str) -> UserContext: if settings.use_access_flags: return EffectiveUserContext(username) else: return CurrentUserContext() + async def _worker_exec(username: str, action: str, **kwargs): + """Dispatch an action to the per-user worker and return the result. - def _get_file_proxy_client(sharing_key: str, captured_path: str) -> Tuple[FileProxyClient | Response, UserContext | None, str]: - """Resolve a sharing key and captured path to a FileProxyClient. + When use_access_flags=True, dispatches to the persistent worker pool. + When use_access_flags=False (dev/test mode), runs the action directly + in the current process since no identity switching is needed. - Returns (client, user_context, subpath) on success, or (error_response, None, "") on failure. + Raises HTTPException on worker-level errors or dead workers. + """ + if worker_pool is not None: + try: + worker = await worker_pool.get_worker(username) + return await worker.execute(action, **kwargs) + except WorkerDead as e: + logger.error(f"Worker dead for {username}: {e}") + raise HTTPException(status_code=503, detail="Service temporarily unavailable") + except WorkerError as e: + raise # Let caller handle application-level errors + else: + # Dev/test mode: run action directly in-process + from fileglancer.user_worker import _ACTIONS, WorkerContext + handler = _ACTIONS.get(action) + if handler is None: + raise HTTPException(status_code=500, detail=f"Unknown action: {action}") + ctx = WorkerContext(username=username, db_url=settings.db_url) + request = {"action": action, **kwargs} + return handler(request, ctx) + + def _resolve_proxy_info(sharing_key: str, captured_path: str) -> Tuple[dict | Response, str]: + """Resolve a sharing key to proxy info (mount_path, target_name, username, subpath). + + Returns (info_dict, subpath) on success, or (error_response, "") on failure. """ def try_strip_prefix(captured: str, prefix: str) -> str | None: if captured == prefix: @@ -237,27 +268,25 @@ def try_strip_prefix(captured: str, prefix: str) -> str | None: proxied_path = db.get_proxied_path_by_sharing_key(session, sharing_key) if not proxied_path: - return get_nosuchbucket_response(captured_path), None, "" + return get_nosuchbucket_response(captured_path), "" - # Match captured_path against the stored url_prefix. - # The unquote() fallback handles clients like Vol-E viewer that send URLs - # with literal % characters instead of proper URL encoding — FastAPI - # auto-decodes path params, so we need to match the decoded form too. subpath = try_strip_prefix(captured_path, proxied_path.url_prefix) if subpath is None: subpath = try_strip_prefix(captured_path, unquote(proxied_path.url_prefix)) if subpath is None: - return get_error_response(404, "NoSuchKey", f"Path mismatch for sharing key {sharing_key}", captured_path), None, "" + return get_error_response(404, "NoSuchKey", f"Path mismatch for sharing key {sharing_key}", captured_path), "" fsp = db.get_file_share_path(session, proxied_path.fsp_name) if not fsp: - return get_error_response(400, "InvalidArgument", f"File share path {proxied_path.fsp_name} not found", captured_path), None, "" - # Expand ~ to user's home directory before constructing the mount path + return get_error_response(400, "InvalidArgument", f"File share path {proxied_path.fsp_name} not found", captured_path), "" expanded_mount_path = os.path.expanduser(fsp.mount_path) mount_path = f"{expanded_mount_path}/{proxied_path.path}" target_name = captured_path.rsplit('/', 1)[-1] if captured_path else os.path.basename(proxied_path.path) - # Use 256KB buffer for better performance on network filesystems - return FileProxyClient(proxy_kwargs={'target_name': target_name}, path=mount_path, buffer_size=256*1024), _get_user_context(proxied_path.username), subpath + return { + "mount_path": mount_path, + "target_name": target_name, + "username": proxied_path.username, + }, subpath @asynccontextmanager @@ -352,6 +381,11 @@ def mask_password(url: str) -> str: else: logger.debug(f"No notifications file found at {notifications_file}") + # Start worker pool eviction loop (only when using access flags) + if worker_pool is not None: + await worker_pool.start_eviction_loop() + logger.info("Worker pool started") + # Start cluster job monitor try: await apps_module.start_job_monitor() @@ -368,6 +402,14 @@ def mask_password(url: str) -> str: except Exception as e: logger.warning(f"Error stopping cluster job monitor: {e}") + # Cleanup: shut down all workers + if worker_pool is not None: + try: + await worker_pool.shutdown_all() + logger.info("Worker pool shut down") + except Exception as e: + logger.warning(f"Error shutting down worker pool: {e}") + app = FastAPI(lifespan=lifespan) # Add custom access log middleware @@ -927,14 +969,18 @@ async def create_proxied_path(fsp_name: str = Query(..., description="The name o _validate_url_prefix(url_prefix) sharing_name = url_prefix logger.info(f"Creating proxied path for {username} with sharing name {sharing_name} and fsp_name {fsp_name} and path {path} (url_prefix={url_prefix})") + # Validate the user can access the path via worker + validation = await _worker_exec(username, "validate_proxied_path", fsp_name=fsp_name, path=path) + if "error" in validation: + raise HTTPException(status_code=400, detail=validation["error"]) + with db.get_db_session(settings.db_url) as session: - with _get_user_context(username): # Necessary to validate the user can access the proxied path - try: - new_path = db.create_proxied_path(session, username, sharing_name, fsp_name, path, url_prefix=url_prefix) - return _convert_proxied_path(new_path, settings.external_proxy_url) - except ValueError as e: - logger.error(f"Error creating proxied path: {e}") - raise HTTPException(status_code=400, detail=str(e)) + try: + new_path = db.create_proxied_path(session, username, sharing_name, fsp_name, path, url_prefix=url_prefix) + return _convert_proxied_path(new_path, settings.external_proxy_url) + except ValueError as e: + logger.error(f"Error creating proxied path: {e}") + raise HTTPException(status_code=400, detail=str(e)) @app.get("/api/proxied-path", response_model=ProxiedPathResponse, @@ -969,14 +1015,25 @@ async def update_proxied_path(sharing_key: str = Path(..., description="The shar path: Optional[str] = Query(default=None, description="The path relative to the file share path mount point"), sharing_name: Optional[str] = Query(default=None, description="The sharing path of the proxied path"), username: str = Depends(get_current_user)): + # If path or fsp_name is changing, validate access via worker + if path is not None or fsp_name is not None: + with db.get_db_session(settings.db_url) as session: + existing = db.get_proxied_path_by_sharing_key(session, sharing_key) + if existing: + validate_fsp = fsp_name or existing.fsp_name + validate_path = path or existing.path + validation = await _worker_exec(username, "validate_proxied_path", + fsp_name=validate_fsp, path=validate_path) + if "error" in validation: + raise HTTPException(status_code=400, detail=validation["error"]) + with db.get_db_session(settings.db_url) as session: - with _get_user_context(username): # Necessary to validate the user can access the proxied path - try: - updated = db.update_proxied_path(session, username, sharing_key, new_path=path, new_sharing_name=sharing_name, new_fsp_name=fsp_name) - return _convert_proxied_path(updated, settings.external_proxy_url) - except ValueError as e: - logger.error(f"Error updating proxied path: {e}") - raise HTTPException(status_code=400, detail=str(e)) + try: + updated = db.update_proxied_path(session, username, sharing_key, new_path=path, new_sharing_name=sharing_name, new_fsp_name=fsp_name) + return _convert_proxied_path(updated, settings.external_proxy_url) + except ValueError as e: + logger.error(f"Error updating proxied path: {e}") + raise HTTPException(status_code=400, detail=str(e)) @app.delete("/api/proxied-path/{sharing_key}", description="Delete a proxied path by sharing key") @@ -1060,41 +1117,81 @@ async def target_dispatcher(request: Request, if 'acl' in request.query_params: return get_read_access_acl() - client, ctx, subpath = _get_file_proxy_client(sharing_key, path) - if isinstance(client, Response): - return client + info, subpath = _resolve_proxy_info(sharing_key, path) + if isinstance(info, Response): + return info if list_type: if list_type == 2: - with ctx: - return await client.list_objects_v2(continuation_token, delimiter, \ - encoding_type, fetch_owner, max_keys, prefix, start_after) + result = await _worker_exec(info["username"], "s3_list_objects", + mount_path=info["mount_path"], + target_name=info["target_name"], + continuation_token=continuation_token, + delimiter=delimiter, + encoding_type=encoding_type, + fetch_owner=fetch_owner, + max_keys=max_keys, + prefix=prefix, + start_after=start_after) + return Response(content=result["body"], media_type=result.get("media_type", "application/xml"), + status_code=result.get("status_code", 200)) else: return get_error_response(400, "InvalidArgument", f"Invalid list type {list_type}", path) else: range_header = request.headers.get("range") - # Open file in user context, then immediately exit - # The file descriptor retains access rights after we switch back to root - with ctx: - handle = await client.open_object(subpath, range_header) - - # Context exited! Now stream without holding the lock - if isinstance(handle, ObjectHandle): - return client.stream_object(handle) + result = await _worker_exec(info["username"], "s3_open_object", + mount_path=info["mount_path"], + target_name=info["target_name"], + path=subpath, + range_header=range_header) + + if result.get("type") == "handle": + # Worker validated access and returned file metadata + # Open the file in main process (root can read anything) + resolved_path = result["resolved_path"] + if resolved_path is None: + return get_error_response(404, "NoSuchKey", "File not found", subpath) + + file_handle = open(resolved_path, "rb") + from x2s3.client_file import FileObjectHandle, file_iterator + handle = FileObjectHandle( + target_name=result["target_name"], + key=result["key"], + status_code=result["status_code"], + headers=result["headers"], + media_type=result.get("media_type"), + content_length=result["content_length"], + file_handle=file_handle, + start=result["start"], + end=result["end"], + ) + return StreamingResponse( + file_iterator(handle, 256 * 1024), + status_code=handle.status_code, + headers=handle.headers, + media_type=handle.media_type, + ) else: - # Error response (e.g., file not found, invalid range) - return handle + # Error response + return Response( + content=result.get("body", ""), + status_code=result.get("status_code", 500), + headers=result.get("headers", {}), + ) @app.head("/files/{sharing_key}/{path:path}") async def head_object(sharing_key: str, path: str = ''): try: - client, ctx, subpath = _get_file_proxy_client(sharing_key, path) - if isinstance(client, Response): - return client - with ctx: - return await client.head_object(subpath) + info, subpath = _resolve_proxy_info(sharing_key, path) + if isinstance(info, Response): + return info + result = await _worker_exec(info["username"], "s3_head_object", + mount_path=info["mount_path"], + target_name=info["target_name"], + path=subpath) + return Response(headers=result.get("headers", {}), status_code=result.get("status_code", 200)) except: logger.opt(exception=sys.exc_info()).info("Error requesting head") return get_error_response(500, "InternalError", "Error requesting HEAD", path) @@ -1130,59 +1227,18 @@ def _get_filestore(path_name: str): @app.get("/api/profile", description="Get the current user's profile") async def get_profile(username: str = Depends(get_current_user)): """Get the current user's profile""" - with _get_user_context(username): - - # Find matching file share path for home directory - with db.get_db_session(settings.db_url) as session: - paths = db.get_file_share_paths(session) - - # First, check if there's a "home" FSP (for ~/ paths) - home_fsp = next((fsp for fsp in paths if fsp.mount_path in ('~', '~/')), None) - if home_fsp: - home_directory_name = "." - else: - # If no "home" FSP exists, fall back to finding by mount path - home_directory_path = os.path.expanduser(f"~{username}") - home_parent = os.path.dirname(home_directory_path) - home_fsp = next((fsp for fsp in paths if fsp.mount_path == home_parent), None) - home_directory_name = os.path.basename(home_directory_path) - - home_fsp_name = home_fsp.name if home_fsp else None - - # Get user groups - user_groups = [] - try: - user_info = pwd.getpwnam(username) - all_groups = grp.getgrall() - for group in all_groups: - if username in group.gr_mem: - user_groups.append(group.gr_name) - primary_group = grp.getgrgid(user_info.pw_gid).gr_name - if primary_group not in user_groups: - user_groups.append(primary_group) - except Exception as e: - logger.error(f"Error getting groups for user {username}: {str(e)}") - - return { - "username": username, - "homeFileSharePathName": home_fsp_name, - "homeDirectoryName": home_directory_name, - "groups": user_groups, - } + result = await _worker_exec(username, "get_profile") + return result # SSH Key Management endpoints @app.get("/api/ssh-keys", response_model=sshkeys.SSHKeyListResponse, description="List Fileglancer-managed SSH keys") async def list_ssh_keys(username: str = Depends(get_current_user)): """List SSH keys with 'fileglancer' in the comment from authorized_keys""" - with _get_user_context(username): - try: - ssh_dir = sshkeys.get_ssh_directory() - keys = sshkeys.list_ssh_keys(ssh_dir) - return sshkeys.SSHKeyListResponse(keys=keys) - except Exception as e: - logger.error(f"Error listing SSH keys for {username}: {e}") - raise HTTPException(status_code=500, detail=str(e)) + result = await _worker_exec(username, "list_ssh_keys") + if "error" in result: + raise HTTPException(status_code=result.get("status_code", 500), detail=result["error"]) + return sshkeys.SSHKeyListResponse(keys=[sshkeys.SSHKeyInfo(**k) for k in result["keys"]]) @app.post("/api/ssh-keys/generate-temp", description="Generate a temporary SSH key and return private key for one-time copy") @@ -1194,21 +1250,23 @@ async def generate_temp_ssh_key( The private key is streamed securely and the temporary files are deleted after the response is sent. Key info is included in response headers: - - X-SSH-Key-Filename - - X-SSH-Key-Type - X-SSH-Key-Fingerprint - X-SSH-Key-Comment """ - with _get_user_context(username): - try: - ssh_dir = sshkeys.get_ssh_directory() - return sshkeys.generate_temp_key_and_authorize(ssh_dir, request.passphrase) - - except RuntimeError as e: - raise HTTPException(status_code=500, detail=str(e)) - except Exception as e: - logger.error(f"Error generating temp SSH key for {username}: {e}") - raise HTTPException(status_code=500, detail=str(e)) + result = await _worker_exec(username, "generate_ssh_key", passphrase=request.passphrase) + if "error" in result: + raise HTTPException(status_code=result.get("status_code", 500), detail=result["error"]) + # Reconstruct the response with headers + headers = {} + if result.get("fingerprint"): + headers["X-SSH-Key-Fingerprint"] = result["fingerprint"] + if result.get("comment"): + headers["X-SSH-Key-Comment"] = result["comment"] + return Response( + content=result["private_key"], + media_type="application/x-pem-file", + headers=headers, + ) # File content endpoint @app.head("/api/content/{path_name:path}") @@ -1222,40 +1280,32 @@ async def head_file_content(path_name: str, else: filestore_name, _, subpath = path_name.partition('/') - with _get_user_context(username): - filestore, error = _get_filestore(filestore_name) - if filestore is None: - raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) - - file_name = subpath.split('/')[-1] if subpath else '' - content_type = guess_content_type(file_name) - - try: - file_info = filestore.get_file_info(subpath) - - is_binary = filestore.check_is_binary(subpath) - - headers = { - 'Accept-Ranges': 'bytes', - 'X-Is-Binary': 'true' if is_binary else 'false', - } - - if content_type == 'application/octet-stream' and file_name: - headers['Content-Disposition'] = f'attachment; filename="{file_name}"' - - if hasattr(file_info, 'size') and file_info.size is not None: - headers['Content-Length'] = str(file_info.size) - - if hasattr(file_info, 'last_modified') and file_info.last_modified is not None: - headers['Last-Modified'] = format_timestamp(file_info.last_modified) - - return Response(status_code=200, headers=headers, media_type=content_type) + result = await _worker_exec(username, "head_file", fsp_name=filestore_name, subpath=subpath) + if result.get("redirect"): + redirect_url = f"/api/content/{result['fsp_name']}" + if result.get("subpath"): + redirect_url += f"?subpath={result['subpath']}" + return RedirectResponse(url=redirect_url, status_code=307) + if "error" in result: + raise HTTPException(status_code=result.get("status_code", 500), detail=result["error"]) + + info = result["info"] + file_name = subpath.split('/')[-1] if subpath else '' + content_type = result["content_type"] + is_binary = result["is_binary"] + + headers = { + 'Accept-Ranges': 'bytes', + 'X-Is-Binary': 'true' if is_binary else 'false', + } + if content_type == 'application/octet-stream' and file_name: + headers['Content-Disposition'] = f'attachment; filename="{file_name}"' + if info.get("size") is not None: + headers['Content-Length'] = str(info["size"]) + if info.get("last_modified") is not None: + headers['Last-Modified'] = format_timestamp(info["last_modified"]) - except FileNotFoundError: - logger.warning(f"File not found in {filestore_name}: {subpath}") - raise HTTPException(status_code=404, detail="File not found") - except PermissionError: - raise HTTPException(status_code=403, detail="Permission denied") + return Response(status_code=200, headers=headers, media_type=content_type) @app.get("/api/content/{path_name:path}") @@ -1267,59 +1317,25 @@ async def get_file_content(request: Request, path_name: str, subpath: Optional[s else: filestore_name, _, subpath = path_name.partition('/') - # Open file with user's permissions, then immediately release the context - # The file descriptor retains the access rights after we switch back to root - with _get_user_context(username): - filestore, error = _get_filestore(filestore_name) - if filestore is None: - raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) + # Worker validates path and returns metadata (runs as user) + result = await _worker_exec(username, "open_file", fsp_name=filestore_name, subpath=subpath) - file_name = subpath.split('/')[-1] if subpath else '' - content_type = guess_content_type(file_name) + if result.get("redirect"): + redirect_url = f"/api/content/{result['fsp_name']}" + if result.get("subpath"): + redirect_url += f"?subpath={result['subpath']}" + return RedirectResponse(url=redirect_url, status_code=307) + if "error" in result: + raise HTTPException(status_code=result.get("status_code", 500), detail=result["error"]) - try: - file_info = filestore.get_file_info(subpath) - if file_info.is_dir: - raise HTTPException(status_code=400, detail="Cannot download directory content") - - file_size = file_info.size - - # Open the file while we have user's permissions - full_path = filestore._check_path_in_root(subpath) - file_handle = open(full_path, 'rb') - - except RootCheckError as e: - # Path attempts to escape root directory - try to find a valid fsp for this absolute path - logger.info(f"RootCheckError caught for {filestore_name}/{subpath}: {e}") - - # Use the full_path from the exception - full_path = e.full_path - - with db.get_db_session(settings.db_url) as session: - match = db.find_fsp_from_absolute_path(session, full_path) + full_path = result["full_path"] + file_size = result["file_size"] + content_type = result["content_type"] + file_name = subpath.split('/')[-1] if subpath else '' - if match: - fsp, relative_subpath = match - # Construct the correct URL - if relative_subpath: - redirect_url = f"/api/content/{fsp.name}?subpath={relative_subpath}" - else: - redirect_url = f"/api/content/{fsp.name}" - - logger.info(f"Redirecting from /api/content/{filestore_name}?subpath={subpath} to {redirect_url}") - return RedirectResponse(url=redirect_url, status_code=307) - - # If no match found, return the original error message - logger.error(f"No valid file share found for path: {full_path}") - raise HTTPException(status_code=400, detail=str(e)) - except FileNotFoundError: - logger.error(f"File not found in {filestore_name}: {subpath}") - raise HTTPException(status_code=404, detail="File or directory not found") - except PermissionError: - raise HTTPException(status_code=403, detail="Permission denied") - - # Context exited! We're back to root, but file_handle retains user's access rights - # Now we can stream the file asynchronously without holding the user context lock + # Open file in main process — the worker validated access; + # main process runs as root so it can open the validated path + file_handle = open(full_path, 'rb') range_header = request.headers.get('Range') @@ -1344,8 +1360,10 @@ async def get_file_content(request: Request, path_name: str, subpath: Optional[s if content_type == 'application/octet-stream' and file_name: headers['Content-Disposition'] = f'attachment; filename="{file_name}"' + # Construct a temporary filestore just for streaming + # (stream_file_range only needs the file_handle) return StreamingResponse( - filestore.stream_file_range(start=start, end=end, file_handle=file_handle), + Filestore._stream_range(start=start, end=end, content_length=content_length, file_handle=file_handle), status_code=206, headers=headers, media_type=content_type @@ -1360,7 +1378,7 @@ async def get_file_content(request: Request, path_name: str, subpath: Optional[s headers['Content-Disposition'] = f'attachment; filename="{file_name}"' return StreamingResponse( - filestore.stream_file_contents(file_handle=file_handle), + Filestore._stream_contents(file_handle=file_handle), status_code=200, headers=headers, media_type=content_type @@ -1379,73 +1397,27 @@ async def get_file_metadata(path_name: str, subpath: Optional[str] = Query(''), else: filestore_name, _, subpath = path_name.partition('/') - with _get_user_context(username): - filestore, error = _get_filestore(filestore_name) - if filestore is None: - raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) - - try: - with db.get_db_session(settings.db_url) as session: - file_info = filestore.get_file_info(subpath, current_user=username, session=session) - logger.trace(f"File info: {file_info}") - - result = {"info": json.loads(file_info.model_dump_json())} - - if file_info.is_dir: - try: - if limit is not None: - files, has_more, next_cursor, total_count = filestore.yield_file_infos_paginated( - subpath, current_user=username, session=session, - limit=limit, cursor=cursor - ) - result["files"] = [json.loads(f.model_dump_json()) for f in files] - result["has_more"] = has_more - result["next_cursor"] = next_cursor - result["total_count"] = total_count - else: - files = list(filestore.yield_file_infos(subpath, current_user=username, session=session)) - result["files"] = [json.loads(f.model_dump_json()) for f in files] - except PermissionError: - logger.error(f"Permission denied when listing files in directory: {subpath}") - result["files"] = [] - result["error"] = "Permission denied when listing directory contents" - return JSONResponse(content=result, status_code=403) - except FileNotFoundError: - logger.error(f"Directory not found during listing: {subpath}") - result["files"] = [] - result["error"] = "Directory contents not found" - return JSONResponse(content=result, status_code=404) - - return result - - except RootCheckError as e: - # Path attempts to escape root directory - try to find a valid fsp for this absolute path - logger.info(f"RootCheckError caught for {filestore_name}/{subpath}: {e}") - - full_path = e.full_path - - with db.get_db_session(settings.db_url) as session: - match = db.find_fsp_from_absolute_path(session, full_path) - - if match: - fsp, relative_subpath = match - # Construct the correct URL - if relative_subpath: - redirect_url = f"/api/files/{fsp.name}?subpath={relative_subpath}" - else: - redirect_url = f"/api/files/{fsp.name}" - - logger.info(f"Redirecting from /api/files/{filestore_name}?subpath={subpath} to {redirect_url}") - return RedirectResponse(url=redirect_url, status_code=307) - - # If no match found, return the original error message - logger.error(f"No valid file share found for path: {full_path}") - raise HTTPException(status_code=400, detail=str(e)) - except FileNotFoundError: - logger.error(f"File or directory not found: {subpath}") - raise HTTPException(status_code=404, detail="File or directory not found") - except PermissionError: - raise HTTPException(status_code=403, detail="Permission denied") + if limit is not None: + result = await _worker_exec(username, "list_dir_paged", + fsp_name=filestore_name, subpath=subpath, + limit=limit, cursor=cursor) + else: + result = await _worker_exec(username, "list_dir", + fsp_name=filestore_name, subpath=subpath) + + if result.get("redirect"): + redirect_url = f"/api/files/{result['fsp_name']}" + if result.get("subpath"): + redirect_url += f"?subpath={result['subpath']}" + return RedirectResponse(url=redirect_url, status_code=307) + if "error" in result and "status_code" in result: + status_code = result["status_code"] + if status_code == 403 or status_code == 404: + return JSONResponse(content=result, status_code=status_code) + raise HTTPException(status_code=status_code, detail=result["error"]) + if "error" in result: + raise HTTPException(status_code=500, detail=result["error"]) + return result @app.post("/api/files/{path_name}") @@ -1474,30 +1446,19 @@ async def create_file_or_dir(path_name: str, # Use the validated and sanitized path for all operations validated_subpath = normalized_path - with _get_user_context(username): - filestore, error = _get_filestore(path_name) - if filestore is None: - raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) - - try: - file_type = body.get("type") - if file_type == "directory": - logger.info(f"User {username} creating directory {path_name}/{validated_subpath}") - # Path is validated above - safe to use in filesystem operation - filestore.create_dir(validated_subpath) - elif file_type == "file": - logger.info(f"User {username} creating file {path_name}/{validated_subpath}") - # Path is validated above - safe to use in filesystem operation - filestore.create_empty_file(validated_subpath) - else: - raise HTTPException(status_code=400, detail="Invalid file type") - - except FileExistsError: - raise HTTPException(status_code=409, detail="A file or directory with this name already exists") - except PermissionError as e: - raise HTTPException(status_code=403, detail=str(e)) + file_type = body.get("type") + if file_type == "directory": + logger.info(f"User {username} creating directory {path_name}/{validated_subpath}") + result = await _worker_exec(username, "create_dir", fsp_name=path_name, subpath=validated_subpath) + elif file_type == "file": + logger.info(f"User {username} creating file {path_name}/{validated_subpath}") + result = await _worker_exec(username, "create_file", fsp_name=path_name, subpath=validated_subpath) + else: + raise HTTPException(status_code=400, detail="Invalid file type") - return JSONResponse(status_code=201, content={"message": "Item created"}) + if "error" in result: + raise HTTPException(status_code=result.get("status_code", 500), detail=result["error"]) + return JSONResponse(status_code=201, content={"message": "Item created"}) @app.patch("/api/files/{path_name}") @@ -1506,47 +1467,26 @@ async def update_file_or_dir(path_name: str, body: Dict = Body(...), username: str = Depends(get_current_user)): """Handle PATCH requests to rename or update file permissions""" - with _get_user_context(username): - filestore, error = _get_filestore(path_name) - if filestore is None: - raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) - old_file_info = filestore.get_file_info(subpath, username) - new_path = body.get("path") - new_permissions = body.get("permissions") - - # Validate and sanitize new_path if renaming - validated_new_path = new_path - if new_path is not None and new_path != old_file_info.path: - # Normalize the path to prevent path traversal - normalized_new_path = os.path.normpath(new_path) - - # Security check: Ensure normalized path doesn't escape directory - if normalized_new_path.startswith('..') or os.path.isabs(normalized_new_path): - raise HTTPException(status_code=400, detail="New path cannot escape the current directory") - - # Validate the filename portion for invalid characters - new_filename = os.path.basename(normalized_new_path) - _validate_filename(new_filename) - - # Use the validated path - validated_new_path = normalized_new_path - - try: - if new_permissions is not None and new_permissions != old_file_info.permissions: - logger.info(f"User {username} changing permissions of {old_file_info.absolute_path} to {new_permissions}") - filestore.change_file_permissions(subpath, new_permissions) - - if new_path is not None and new_path != old_file_info.path: - logger.info(f"User {username} renaming {old_file_info.absolute_path} to {validated_new_path}") - # Path is validated above - safe to use in filesystem operation - filestore.rename_file_or_dir(old_file_info.path, validated_new_path) - - except PermissionError as e: - raise HTTPException(status_code=403, detail=str(e)) - except OSError as e: - raise HTTPException(status_code=500, detail=str(e)) - - return JSONResponse(status_code=200, content={"message": "Permissions changed"}) + new_path = body.get("path") + new_permissions = body.get("permissions") + + # Validate and sanitize new_path if renaming + validated_new_path = new_path + if new_path is not None: + normalized_new_path = os.path.normpath(new_path) + if normalized_new_path.startswith('..') or os.path.isabs(normalized_new_path): + raise HTTPException(status_code=400, detail="New path cannot escape the current directory") + new_filename = os.path.basename(normalized_new_path) + _validate_filename(new_filename) + validated_new_path = normalized_new_path + + result = await _worker_exec(username, "update_file", + fsp_name=path_name, subpath=subpath, + new_path=validated_new_path, + new_permissions=new_permissions) + if "error" in result: + raise HTTPException(status_code=result.get("status_code", 500), detail=result["error"]) + return JSONResponse(status_code=200, content={"message": "Permissions changed"}) @app.delete("/api/files/{fsp_name}") @@ -1554,18 +1494,11 @@ async def delete_file_or_dir(fsp_name: str, subpath: Optional[str] = Query(''), username: str = Depends(get_current_user)): """Handle DELETE requests to remove a file or (empty) directory""" - with _get_user_context(username): - filestore, error = _get_filestore(fsp_name) - if filestore is None: - raise HTTPException(status_code=404 if "not found" in error else 500, detail=error) - - try: - logger.info(f"User {username} deleting {filestore.get_root_path()}/{subpath}") - filestore.remove_file_or_dir(subpath) - except PermissionError as e: - raise HTTPException(status_code=403, detail=str(e)) - - return JSONResponse(status_code=200, content={"message": "Item deleted"}) + logger.info(f"User {username} deleting {fsp_name}/{subpath}") + result = await _worker_exec(username, "delete", fsp_name=fsp_name, subpath=subpath) + if "error" in result: + raise HTTPException(status_code=result.get("status_code", 500), detail=result["error"]) + return JSONResponse(status_code=200, content={"message": "Item deleted"}) # --- Apps & Jobs API --- @@ -1751,14 +1684,8 @@ async def update_user_app(body: ManifestFetchRequest, description="Validate file/directory paths for app parameters") async def validate_paths(body: PathValidationRequest, username: str = Depends(get_current_user)): - errors = {} - with _get_user_context(username): - with db.get_db_session(settings.db_url) as session: - for param_key, path_value in body.paths.items(): - error = apps_module.validate_path_in_filestore(path_value, session) - if error: - errors[param_key] = error - return PathValidationResponse(errors=errors) + result = await _worker_exec(username, "validate_paths", paths=body.paths) + return PathValidationResponse(errors=result.get("errors", {})) @app.get("/api/cluster-defaults", description="Get cluster configuration defaults") @@ -1791,8 +1718,7 @@ async def submit_job(body: JobSubmitRequest, container=body.container, container_args=body.container_args, ) - with _get_user_context(username): - return _convert_job(db_job) + return _convert_job(db_job) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) except Exception as e: @@ -1805,8 +1731,17 @@ async def get_jobs(status: Optional[str] = Query(None, description="Filter by st username: str = Depends(get_current_user)): with db.get_db_session(settings.db_url) as session: db_jobs = db.get_jobs_by_username(session, username, status) - with _get_user_context(username): - jobs = [_convert_job(j) for j in db_jobs] + # For listing, read service_url for running service jobs via worker + jobs = [] + for j in db_jobs: + service_url = None + if getattr(j, 'entry_point_type', 'job') == 'service' and j.status == 'RUNNING': + try: + result = await _worker_exec(username, "get_service_url", job_id=j.id) + service_url = result.get("service_url") + except Exception: + pass + jobs.append(_convert_job(j, service_url=service_url)) return JobResponse(jobs=jobs) @app.get("/api/jobs/{job_id}", response_model=Job, @@ -1817,8 +1752,16 @@ async def get_job(job_id: int, db_job = db.get_job(session, job_id, username) if db_job is None: raise HTTPException(status_code=404, detail="Job not found") - with _get_user_context(username): - return _convert_job(db_job, include_files=True) + # Read file paths and service URL via worker + files_result = await _worker_exec(username, "get_job_file_paths", job_id=job_id) + service_url = None + if getattr(db_job, 'entry_point_type', 'job') == 'service' and db_job.status == 'RUNNING': + try: + svc_result = await _worker_exec(username, "get_service_url", job_id=job_id) + service_url = svc_result.get("service_url") + except Exception: + pass + return _convert_job(db_job, service_url=service_url, files=files_result.get("files")) @app.post("/api/jobs/{job_id}/cancel", description="Cancel a running job") @@ -1826,8 +1769,7 @@ async def cancel_job(job_id: int, username: str = Depends(get_current_user)): try: db_job = await apps_module.cancel_job(job_id, username) - with _get_user_context(username): - return _convert_job(db_job) + return _convert_job(db_job) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -1849,12 +1791,14 @@ async def get_job_file(job_id: int, if file_type not in ("script", "stdout", "stderr"): raise HTTPException(status_code=400, detail="file_type must be script, stdout, or stderr") try: - with _get_user_context(username): - content = await apps_module.get_job_file_content(job_id, username, file_type) + result = await _worker_exec(username, "get_job_file", job_id=job_id, file_type=file_type) + if "error" in result: + raise HTTPException(status_code=result.get("status_code", 404), detail=result["error"]) + content = result.get("content") if content is None: raise HTTPException(status_code=404, detail=f"File not found: {file_type}") return PlainTextResponse(content) - except ValueError as e: + except WorkerError as e: raise HTTPException(status_code=404, detail=str(e)) def _ensure_utc(dt: Optional[datetime]) -> Optional[datetime]: @@ -1870,11 +1814,12 @@ def _ensure_utc(dt: Optional[datetime]) -> Optional[datetime]: return dt.replace(tzinfo=UTC) return dt - def _convert_job(db_job: db.JobDB, include_files: bool = False) -> Job: - """Convert a database JobDB to a Pydantic Job model.""" - files = None - if include_files: - files = apps_module.get_job_file_paths(db_job) + def _convert_job(db_job: db.JobDB, service_url: str = None, files: dict = None) -> Job: + """Convert a database JobDB to a Pydantic Job model. + + File-reading fields (service_url, files) must be passed in pre-computed + by the caller, since they require user-context file I/O. + """ return Job( id=db_job.id, app_url=db_job.app_url, @@ -1894,7 +1839,7 @@ def _convert_job(db_job: db.JobDB, include_files: bool = False) -> Job: container_args=db_job.container_args, pull_latest=db_job.pull_latest, cluster_job_id=db_job.cluster_job_id, - service_url=apps_module.get_service_url(db_job), + service_url=service_url, created_at=_ensure_utc(db_job.created_at), started_at=_ensure_utc(db_job.started_at), finished_at=_ensure_utc(db_job.finished_at), diff --git a/fileglancer/settings.py b/fileglancer/settings.py index 3b21bdef..8b597789 100644 --- a/fileglancer/settings.py +++ b/fileglancer/settings.py @@ -100,6 +100,10 @@ class Settings(BaseSettings): # Useful for setting up scheduler env (e.g., /misc/lsf/conf/profile.lsf). env_source_script: Optional[str] = None + # Worker pool settings + worker_pool_max_workers: int = 50 + worker_pool_idle_timeout: int = 300 # seconds + # Cluster / Apps settings (mirrors py-cluster-api ClusterConfig) cluster: ClusterSettings = ClusterSettings() diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py new file mode 100644 index 00000000..0777085b --- /dev/null +++ b/fileglancer/user_worker.py @@ -0,0 +1,1025 @@ +"""Persistent per-user worker subprocess. + +This module is the entry point for long-lived worker subprocesses spawned by +WorkerPool. Each worker runs as a single user (identity set at fork time) +and handles all user-scoped operations: file I/O, cluster jobs, git ops, +SSH key management, etc. + +Protocol: + - IPC over a Unix socketpair (fd passed via FGC_WORKER_FD env var) + - Messages are length-prefixed JSON: 4-byte big-endian length + JSON body + - Worker reads requests, dispatches to action handlers, writes responses + - {"action": "shutdown"} triggers a clean exit + +The worker runs a synchronous request/response loop. Cluster operations +that use py-cluster-api's async API are run via _run_async() per-request. + +In dev/test mode (use_access_flags=False), action handlers are called +directly in-process from server.py, so _run_async() must handle being +called from within an existing event loop. +""" + +from __future__ import annotations + +import asyncio +import ctypes +import ctypes.util +import json +import logging +import os +import pwd +import socket +import struct +import sys +from pathlib import Path +from typing import Any, Optional + +from loguru import logger + + +# Length-prefix format: 4-byte big-endian unsigned int +_HEADER_FMT = "!I" +_HEADER_SIZE = struct.calcsize(_HEADER_FMT) + + +def _run_async(coro): + """Run an async coroutine, handling both subprocess and in-process contexts. + + In subprocess mode (no running event loop), uses asyncio.run(). + In dev/test mode (called from within FastAPI's event loop), uses + a new event loop in a thread to avoid "cannot be called from a running loop". + """ + try: + asyncio.get_running_loop() + except RuntimeError: + # No running loop — we're in the subprocess + return asyncio.run(coro) + else: + # Inside an event loop (dev/test mode) — run in a thread + import concurrent.futures + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + future = pool.submit(asyncio.run, coro) + return future.result() + + +def _set_pdeathsig(): + """Ask the kernel to send SIGTERM when our parent process dies. + + This prevents orphan workers if the main process is killed without + a chance to clean up. Linux-only (PR_SET_PDEATHSIG = 1). + """ + try: + import signal + libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True) + PR_SET_PDEATHSIG = 1 + result = libc.prctl(PR_SET_PDEATHSIG, signal.SIGTERM, 0, 0, 0) + if result != 0: + logger.warning(f"prctl(PR_SET_PDEATHSIG) failed: errno={ctypes.get_errno()}") + except Exception as e: + logger.warning(f"Could not set PR_SET_PDEATHSIG: {e}") + + +def _send(sock: socket.socket, data: dict): + """Send a length-prefixed JSON message.""" + payload = json.dumps(data, default=str).encode() + header = struct.pack(_HEADER_FMT, len(payload)) + sock.sendall(header + payload) + + +def _recv(sock: socket.socket) -> dict: + """Receive a length-prefixed JSON message.""" + header = _recvall(sock, _HEADER_SIZE) + if header is None: + raise ConnectionError("Parent closed connection") + (length,) = struct.unpack(_HEADER_FMT, header) + payload = _recvall(sock, length) + if payload is None: + raise ConnectionError("Parent closed connection mid-message") + return json.loads(payload) + + +def _recvall(sock: socket.socket, n: int) -> Optional[bytes]: + """Read exactly n bytes from socket.""" + data = bytearray() + while len(data) < n: + chunk = sock.recv(n - len(data)) + if not chunk: + return None + data.extend(chunk) + return bytes(data) + + +# --------------------------------------------------------------------------- +# Action handlers — file operations +# --------------------------------------------------------------------------- + +def _get_filestore(fsp_name: str, db_url: str): + """Look up a FileSharePath and return a Filestore instance.""" + from fileglancer import database as db + from fileglancer.filestore import Filestore + + with db.get_db_session(db_url) as session: + fsp = db.get_file_share_path(session, fsp_name) + if fsp is None: + return None, f"File share path '{fsp_name}' not found" + + filestore = Filestore(fsp) + try: + filestore.get_file_info(None) + except FileNotFoundError: + return None, f"File share path '{fsp_name}' is not mounted" + + return filestore, None + + +def _action_list_dir(request: dict, ctx: WorkerContext) -> dict: + """List directory contents.""" + fsp_name = request["fsp_name"] + subpath = request.get("subpath", "") + current_user = ctx.username + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error, "status_code": 404 if "not found" in error else 500} + + from fileglancer import database as db + from fileglancer.filestore import RootCheckError + + try: + with db.get_db_session(ctx.db_url) as session: + file_info = filestore.get_file_info(subpath, current_user=current_user, session=session) + result = {"info": json.loads(file_info.model_dump_json())} + + if file_info.is_dir: + try: + files = list(filestore.yield_file_infos(subpath, current_user=current_user, session=session)) + result["files"] = [json.loads(f.model_dump_json()) for f in files] + except PermissionError: + result["files"] = [] + result["error"] = "Permission denied when listing directory contents" + result["status_code"] = 403 + except FileNotFoundError: + result["files"] = [] + result["error"] = "Directory contents not found" + result["status_code"] = 404 + + return result + except RootCheckError as e: + # Path escapes root — check if it belongs to another file share + with db.get_db_session(ctx.db_url) as session: + match = db.find_fsp_from_absolute_path(session, e.full_path) + if match: + fsp, relative_subpath = match + return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} + return {"error": str(e), "status_code": 400} + except FileNotFoundError: + return {"error": "File or directory not found", "status_code": 404} + except PermissionError: + return {"error": "Permission denied", "status_code": 403} + + +def _action_list_dir_paged(request: dict, ctx: WorkerContext) -> dict: + """List directory contents with pagination.""" + fsp_name = request["fsp_name"] + subpath = request.get("subpath", "") + current_user = ctx.username + limit = request.get("limit", 200) + cursor = request.get("cursor") + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error, "status_code": 404 if "not found" in error else 500} + + from fileglancer import database as db + from fileglancer.filestore import RootCheckError + + try: + with db.get_db_session(ctx.db_url) as session: + file_info = filestore.get_file_info(subpath, current_user=current_user, session=session) + result = {"info": json.loads(file_info.model_dump_json())} + + if file_info.is_dir: + try: + files, has_more, next_cursor, total_count = filestore.yield_file_infos_paginated( + subpath, current_user=current_user, session=session, + limit=limit, cursor=cursor + ) + result["files"] = [json.loads(f.model_dump_json()) for f in files] + result["has_more"] = has_more + result["next_cursor"] = next_cursor + result["total_count"] = total_count + except PermissionError: + result["files"] = [] + result["error"] = "Permission denied when listing directory contents" + result["status_code"] = 403 + except FileNotFoundError: + result["files"] = [] + result["error"] = "Directory contents not found" + result["status_code"] = 404 + + return result + except RootCheckError as e: + with db.get_db_session(ctx.db_url) as session: + match = db.find_fsp_from_absolute_path(session, e.full_path) + if match: + fsp, relative_subpath = match + return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} + return {"error": str(e), "status_code": 400} + except FileNotFoundError: + return {"error": "File or directory not found", "status_code": 404} + except PermissionError: + return {"error": "Permission denied", "status_code": 403} + + +def _action_get_file_info(request: dict, ctx: WorkerContext) -> dict: + """Get metadata for a single file or directory.""" + fsp_name = request["fsp_name"] + subpath = request.get("subpath", "") + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + from fileglancer import database as db + + with db.get_db_session(ctx.db_url) as session: + file_info = filestore.get_file_info(subpath, current_user=ctx.username, session=session) + return {"info": json.loads(file_info.model_dump_json())} + + +def _action_check_binary(request: dict, ctx: WorkerContext) -> dict: + """Check if a file is binary.""" + fsp_name = request["fsp_name"] + subpath = request.get("subpath", "") + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + is_binary = filestore.check_is_binary(subpath) + return {"is_binary": is_binary} + + +def _action_open_file(request: dict, ctx: WorkerContext) -> dict: + """Open a file and return its metadata. + + The actual file content is read and returned as part of the response + for simplicity. For very large files, fd passing via SCM_RIGHTS + could be added as an optimization. + """ + fsp_name = request["fsp_name"] + subpath = request.get("subpath", "") + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + from fileglancer.filestore import RootCheckError + from fileglancer.utils import guess_content_type + + try: + file_info = filestore.get_file_info(subpath) + if file_info.is_dir: + return {"error": "Cannot download directory content", "status_code": 400} + + file_name = subpath.split('/')[-1] if subpath else '' + content_type = guess_content_type(file_name) + full_path = filestore._check_path_in_root(subpath) + + return { + "full_path": full_path, + "file_size": file_info.size, + "content_type": content_type, + } + except RootCheckError as e: + from fileglancer import database as db + with db.get_db_session(ctx.db_url) as session: + match = db.find_fsp_from_absolute_path(session, e.full_path) + if match: + fsp, relative_subpath = match + return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} + return {"error": str(e), "status_code": 400} + except FileNotFoundError: + return {"error": "File or directory not found", "status_code": 404} + except PermissionError: + return {"error": "Permission denied", "status_code": 403} + + +def _action_head_file(request: dict, ctx: WorkerContext) -> dict: + """Get file metadata and binary check for HEAD requests.""" + fsp_name = request["fsp_name"] + subpath = request.get("subpath", "") + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + from fileglancer.filestore import RootCheckError + from fileglancer.utils import guess_content_type + + try: + file_info = filestore.get_file_info(subpath, current_user=ctx.username) + file_name = subpath.split('/')[-1] if subpath else '' + content_type = guess_content_type(file_name) + is_binary = filestore.check_is_binary(subpath) if not file_info.is_dir else False + + return { + "info": json.loads(file_info.model_dump_json()), + "content_type": content_type, + "is_binary": is_binary, + } + except RootCheckError as e: + from fileglancer import database as db + with db.get_db_session(ctx.db_url) as session: + match = db.find_fsp_from_absolute_path(session, e.full_path) + if match: + fsp, relative_subpath = match + return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} + return {"error": str(e), "status_code": 400} + except FileNotFoundError: + return {"error": "File or directory not found", "status_code": 404} + except PermissionError: + return {"error": "Permission denied", "status_code": 403} + + +def _action_create_dir(request: dict, ctx: WorkerContext) -> dict: + """Create a directory.""" + fsp_name = request["fsp_name"] + subpath = request["subpath"] + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + try: + filestore.create_dir(subpath) + return {"ok": True} + except FileExistsError: + return {"error": "A file or directory with this name already exists", "status_code": 409} + except PermissionError as e: + return {"error": str(e), "status_code": 403} + + +def _action_create_file(request: dict, ctx: WorkerContext) -> dict: + """Create an empty file.""" + fsp_name = request["fsp_name"] + subpath = request["subpath"] + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + try: + filestore.create_empty_file(subpath) + return {"ok": True} + except FileExistsError: + return {"error": "A file or directory with this name already exists", "status_code": 409} + except PermissionError as e: + return {"error": str(e), "status_code": 403} + + +def _action_rename(request: dict, ctx: WorkerContext) -> dict: + """Rename a file or directory.""" + fsp_name = request["fsp_name"] + old_path = request["old_path"] + new_path = request["new_path"] + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + try: + filestore.rename_file_or_dir(old_path, new_path) + return {"ok": True} + except PermissionError as e: + return {"error": str(e), "status_code": 403} + except OSError as e: + return {"error": str(e), "status_code": 500} + + +def _action_delete(request: dict, ctx: WorkerContext) -> dict: + """Delete a file or directory.""" + fsp_name = request["fsp_name"] + subpath = request["subpath"] + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + try: + filestore.remove_file_or_dir(subpath) + return {"ok": True} + except PermissionError as e: + return {"error": str(e), "status_code": 403} + except OSError as e: + return {"error": str(e), "status_code": 500} + + +def _action_chmod(request: dict, ctx: WorkerContext) -> dict: + """Change file permissions.""" + fsp_name = request["fsp_name"] + subpath = request["subpath"] + permissions = request["permissions"] + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + try: + filestore.change_file_permissions(subpath, permissions) + return {"ok": True} + except PermissionError as e: + return {"error": str(e), "status_code": 403} + except OSError as e: + return {"error": str(e), "status_code": 500} + + +def _action_update_file(request: dict, ctx: WorkerContext) -> dict: + """Handle rename and/or permission change on a file.""" + fsp_name = request["fsp_name"] + subpath = request.get("subpath", "") + new_path = request.get("new_path") + new_permissions = request.get("new_permissions") + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + try: + old_file_info = filestore.get_file_info(subpath, ctx.username) + result = {"info": json.loads(old_file_info.model_dump_json())} + + if new_permissions is not None and new_permissions != old_file_info.permissions: + filestore.change_file_permissions(subpath, new_permissions) + + if new_path is not None and new_path != old_file_info.path: + filestore.rename_file_or_dir(old_file_info.path, new_path) + + result["ok"] = True + return result + except PermissionError as e: + return {"error": str(e), "status_code": 403} + except OSError as e: + return {"error": str(e), "status_code": 500} + + +def _action_validate_paths(request: dict, ctx: WorkerContext) -> dict: + """Validate file/directory paths for app parameters.""" + from fileglancer.apps.core import validate_path_in_filestore + from fileglancer import database as db + + paths = request["paths"] + errors = {} + with db.get_db_session(ctx.db_url) as session: + for param_key, path_value in paths.items(): + error = validate_path_in_filestore(path_value, session) + if error: + errors[param_key] = error + return {"errors": errors} + + +def _action_get_profile(request: dict, ctx: WorkerContext) -> dict: + """Get user profile information.""" + import grp as _grp + + from fileglancer import database as db + + username = ctx.username + + with db.get_db_session(ctx.db_url) as session: + paths = db.get_file_share_paths(session) + + home_fsp = next((fsp for fsp in paths if fsp.mount_path in ('~', '~/')), None) + if home_fsp: + home_directory_name = "." + else: + home_directory_path = os.path.expanduser(f"~{username}") + home_parent = os.path.dirname(home_directory_path) + home_fsp = next((fsp for fsp in paths if fsp.mount_path == home_parent), None) + home_directory_name = os.path.basename(home_directory_path) + + home_fsp_name = home_fsp.name if home_fsp else None + + user_groups = [] + try: + user_info = pwd.getpwnam(username) + all_groups = _grp.getgrall() + for group in all_groups: + if username in group.gr_mem: + user_groups.append(group.gr_name) + primary_group = _grp.getgrgid(user_info.pw_gid).gr_name + if primary_group not in user_groups: + user_groups.append(primary_group) + except Exception as e: + logger.error(f"Error getting groups for user {username}: {e}") + + return { + "username": username, + "homeFileSharePathName": home_fsp_name, + "homeDirectoryName": home_directory_name, + "groups": user_groups, + } + + +# --------------------------------------------------------------------------- +# Action handlers — SSH keys +# --------------------------------------------------------------------------- + +def _action_list_ssh_keys(request: dict, ctx: WorkerContext) -> dict: + """List SSH keys.""" + from fileglancer import sshkeys + try: + ssh_dir = sshkeys.get_ssh_directory() + keys = sshkeys.list_ssh_keys(ssh_dir) + return {"keys": [k.model_dump() for k in keys]} + except Exception as e: + return {"error": str(e), "status_code": 500} + + +def _action_generate_ssh_key(request: dict, ctx: WorkerContext) -> dict: + """Generate a temporary SSH key and authorize it.""" + from fileglancer import sshkeys + try: + ssh_dir = sshkeys.get_ssh_directory() + passphrase = request.get("passphrase") + result = sshkeys.generate_temp_key_and_authorize(ssh_dir, passphrase) + # TempKeyResponse is a Response object; extract the data we need + return { + "private_key": result.body.decode() if hasattr(result, 'body') else str(result), + "fingerprint": result.headers.get("X-SSH-Key-Fingerprint", "") if hasattr(result, 'headers') else "", + "comment": result.headers.get("X-SSH-Key-Comment", "") if hasattr(result, 'headers') else "", + } + except Exception as e: + return {"error": str(e), "status_code": 500} + + +# --------------------------------------------------------------------------- +# Action handlers — job files +# --------------------------------------------------------------------------- + +def _action_get_job_file(request: dict, ctx: WorkerContext) -> dict: + """Read job file content (script, stdout, stderr).""" + from fileglancer.apps.core import get_job_file_content + job_id = request["job_id"] + file_type = request["file_type"] + content = get_job_file_content(job_id, ctx.username, file_type) + if content is None: + return {"content": None} + return {"content": content} + + +def _action_get_job_file_paths(request: dict, ctx: WorkerContext) -> dict: + """Get job file path info.""" + from fileglancer.apps.core import get_job_file_paths + from fileglancer import database as db + from fileglancer.settings import get_settings + + settings = get_settings() + job_id = request["job_id"] + + with db.get_db_session(settings.db_url) as session: + db_job = db.get_job(session, job_id, ctx.username) + if db_job is None: + return {"error": f"Job {job_id} not found", "status_code": 404} + files = get_job_file_paths(db_job) + return {"files": files} + + +def _action_get_service_url(request: dict, ctx: WorkerContext) -> dict: + """Read service URL from job work directory.""" + from fileglancer.apps.core import get_service_url + from fileglancer import database as db + from fileglancer.settings import get_settings + + settings = get_settings() + job_id = request["job_id"] + + with db.get_db_session(settings.db_url) as session: + db_job = db.get_job(session, job_id, ctx.username) + if db_job is None: + return {"error": f"Job {job_id} not found", "status_code": 404} + url = get_service_url(db_job) + return {"service_url": url} + + +# --------------------------------------------------------------------------- +# Action handlers — S3 proxy +# --------------------------------------------------------------------------- + +def _action_s3_list_objects(request: dict, ctx: WorkerContext) -> dict: + """S3-compatible list objects.""" + from x2s3.client_file import FileProxyClient + + mount_path = request["mount_path"] + target_name = request["target_name"] + buffer_size = request.get("buffer_size", 256 * 1024) + + client = FileProxyClient( + proxy_kwargs={"target_name": target_name}, + path=mount_path, + buffer_size=buffer_size, + ) + + # list_objects_v2 is async def but does only sync I/O + result = _run_async(client.list_objects_v2( + continuation_token=request.get("continuation_token"), + delimiter=request.get("delimiter"), + encoding_type=request.get("encoding_type"), + fetch_owner=request.get("fetch_owner"), + max_keys=request.get("max_keys", 1000), + prefix=request.get("prefix"), + start_after=request.get("start_after"), + )) + # Result is a fastapi Response object + return {"body": result.body.decode(), "media_type": result.media_type, "status_code": result.status_code} + + +def _action_s3_head_object(request: dict, ctx: WorkerContext) -> dict: + """S3-compatible head object.""" + from x2s3.client_file import FileProxyClient + + mount_path = request["mount_path"] + target_name = request["target_name"] + path = request["path"] + + client = FileProxyClient( + proxy_kwargs={"target_name": target_name}, + path=mount_path, + ) + + result = _run_async(client.head_object(path)) + headers = dict(result.headers) if hasattr(result, 'headers') else {} + return {"headers": headers, "status_code": result.status_code} + + +def _action_s3_open_object(request: dict, ctx: WorkerContext) -> dict: + """S3-compatible open object — open the file and return metadata. + + The actual file descriptor stays open in this process; + for now we return the path so the main process can open it + with the fd still valid via the user context. + Actually: the worker opens the file and returns metadata. + The main process will open its own fd (the worker already proved + access is valid by running as the user). + """ + from x2s3.client_file import FileProxyClient, FileObjectHandle + from x2s3.utils import get_nosuchkey_response, get_error_response + + mount_path = request["mount_path"] + target_name = request["target_name"] + path = request["path"] + range_header = request.get("range_header") + + client = FileProxyClient( + proxy_kwargs={"target_name": target_name}, + path=mount_path, + buffer_size=request.get("buffer_size", 256 * 1024), + ) + + result = _run_async(client.open_object(path, range_header)) + + if isinstance(result, FileObjectHandle): + # Return metadata; the file handle stays open in the worker + # The main process will need to stream from here + response = { + "type": "handle", + "status_code": result.status_code, + "headers": result.headers, + "media_type": result.media_type, + "content_length": result.content_length, + "key": result.key, + "target_name": result.target_name, + "start": result.start, + "end": result.end, + # Include the resolved path so main process can open it + "resolved_path": client._safe_path(path), + } + result.close() + return response + else: + # Error response + return { + "type": "error_response", + "body": result.body.decode() if hasattr(result, 'body') else "", + "status_code": result.status_code, + "headers": dict(result.headers) if hasattr(result, 'headers') else {}, + } + + +# --------------------------------------------------------------------------- +# Action handlers — proxied path validation +# --------------------------------------------------------------------------- + +def _action_validate_proxied_path(request: dict, ctx: WorkerContext) -> dict: + """Validate that the user can access a proxied path. + + Runs within the user's context (the worker IS the user), so + filesystem permission checks just work. + """ + fsp_name = request["fsp_name"] + path = request["path"] + + filestore, error = _get_filestore(fsp_name, ctx.db_url) + if filestore is None: + return {"error": error} + + try: + filestore.get_file_info(path) + return {"ok": True} + except (FileNotFoundError, PermissionError) as e: + return {"error": str(e)} + + +# --------------------------------------------------------------------------- +# Action handlers — cluster operations (absorbed from apps/worker.py) +# --------------------------------------------------------------------------- + +def _action_submit(request: dict, ctx: WorkerContext) -> dict: + """Create work dir, symlink repo, submit job via py-cluster-api.""" + from cluster_api import create_executor, ResourceSpec + + config = request["cluster_config"] + config.pop("extra_args", None) + + executor = create_executor(**config) + + work_dir = Path(request["work_dir"]) + work_dir.mkdir(parents=True, exist_ok=True) + + cached_repo_dir = request["cached_repo_dir"] + repo_link = work_dir / "repo" + if repo_link.is_symlink() or repo_link.exists(): + repo_link.unlink() + repo_link.symlink_to(cached_repo_dir) + + res = request["resources"] + resource_spec = ResourceSpec( + cpus=res.get("cpus"), + gpus=res.get("gpus"), + memory=res.get("memory"), + walltime=res.get("walltime"), + queue=res.get("queue"), + work_dir=res["work_dir"], + stdout_path=res.get("stdout_path"), + stderr_path=res.get("stderr_path"), + extra_directives=res.get("extra_directives"), + extra_args=res.get("extra_args"), + ) + + job = _run_async(executor.submit( + command=request["command"], + name=request["job_name"], + resources=resource_spec, + )) + + return {"job_id": job.job_id, "script_path": job.script_path} + + +def _action_cancel(request: dict, ctx: WorkerContext) -> dict: + """Cancel a cluster job via py-cluster-api.""" + from cluster_api import create_executor + + config = request["cluster_config"] + config.pop("extra_args", None) + + executor = create_executor(**config) + _run_async(executor.cancel(request["job_id"])) + return {"status": "ok"} + + +def _action_poll(request: dict, ctx: WorkerContext) -> dict: + """Poll job statuses via py-cluster-api.""" + from cluster_api import create_executor + from cluster_api._types import JobRecord, JobStatus + + config = request["cluster_config"] + config.pop("extra_args", None) + + executor = create_executor(**config) + + known_statuses = request.get("job_statuses", {}) + for cid in request["cluster_job_ids"]: + db_status = known_statuses.get(cid, "PENDING").lower() + try: + seed_status = JobStatus(db_status) + except ValueError: + seed_status = JobStatus.PENDING + executor._jobs[cid] = JobRecord( + job_id=cid, + name="", + command="", + status=seed_status, + ) + + _run_async(executor.poll()) + + jobs = {} + for cid, record in executor.jobs.items(): + jobs[cid] = { + "status": record.status.value, + "exit_code": record.exit_code, + "exec_host": record.exec_host, + "start_time": record.start_time.isoformat() if record.start_time else None, + "finish_time": record.finish_time.isoformat() if record.finish_time else None, + } + + return {"jobs": jobs} + + +def _action_reconnect(request: dict, ctx: WorkerContext) -> dict: + """Reconnect to existing jobs via py-cluster-api.""" + from cluster_api import create_executor + + config = request["cluster_config"] + config.pop("extra_args", None) + + executor = create_executor(**config) + reconnected = _run_async(executor.reconnect()) + + jobs = {} + for record in reconnected: + jobs[record.job_id] = { + "status": record.status.value, + "name": record.name, + "exit_code": record.exit_code, + "exec_host": record.exec_host, + "start_time": record.start_time.isoformat() if record.start_time else None, + "finish_time": record.finish_time.isoformat() if record.finish_time else None, + } + + return {"jobs": jobs} + + +# --------------------------------------------------------------------------- +# Action handlers — git/manifest operations (absorbed from apps/worker.py) +# --------------------------------------------------------------------------- + +def _action_ensure_repo(request: dict, ctx: WorkerContext) -> dict: + """Clone or update a GitHub repo in the current user's cache.""" + from fileglancer.apps.core import _ensure_repo_cache + repo_dir = _run_async(_ensure_repo_cache( + url=request["url"], + pull=request.get("pull", False), + )) + return {"repo_dir": str(repo_dir)} + + +def _action_discover_manifests(request: dict, ctx: WorkerContext) -> dict: + """Clone/pull repo and discover all manifests.""" + from fileglancer.apps.core import _ensure_repo_cache, _find_manifests_in_repo + repo_dir = _run_async(_ensure_repo_cache( + url=request["url"], + pull=True, + )) + results = _find_manifests_in_repo(repo_dir) + return { + "manifests": [ + {"path": path, "manifest": manifest.model_dump(mode="json")} + for path, manifest in results + ] + } + + +def _action_read_manifest(request: dict, ctx: WorkerContext) -> dict: + """Fetch and read a single manifest from a cached repo.""" + from fileglancer.apps.core import _ensure_repo_cache, _read_manifest_file + repo_dir = _run_async(_ensure_repo_cache( + url=request["url"], + pull=request.get("pull", False), + )) + manifest_path = request.get("manifest_path", "") + target_dir = repo_dir / manifest_path if manifest_path else repo_dir + manifest = _read_manifest_file(target_dir) + return {"manifest": manifest.model_dump(mode="json")} + + +# --------------------------------------------------------------------------- +# Action registry +# --------------------------------------------------------------------------- + +_ACTIONS: dict[str, Any] = { + # File operations + "list_dir": _action_list_dir, + "list_dir_paged": _action_list_dir_paged, + "get_file_info": _action_get_file_info, + "check_binary": _action_check_binary, + "open_file": _action_open_file, + "head_file": _action_head_file, + "create_dir": _action_create_dir, + "create_file": _action_create_file, + "rename": _action_rename, + "delete": _action_delete, + "chmod": _action_chmod, + "update_file": _action_update_file, + "validate_paths": _action_validate_paths, + "validate_proxied_path": _action_validate_proxied_path, + "get_profile": _action_get_profile, + # SSH keys + "list_ssh_keys": _action_list_ssh_keys, + "generate_ssh_key": _action_generate_ssh_key, + # Job files + "get_job_file": _action_get_job_file, + "get_job_file_paths": _action_get_job_file_paths, + "get_service_url": _action_get_service_url, + # S3 proxy + "s3_list_objects": _action_s3_list_objects, + "s3_head_object": _action_s3_head_object, + "s3_open_object": _action_s3_open_object, + # Cluster operations + "submit": _action_submit, + "cancel": _action_cancel, + "poll": _action_poll, + "reconnect": _action_reconnect, + # Git/manifest operations + "ensure_repo": _action_ensure_repo, + "discover_manifests": _action_discover_manifests, + "read_manifest": _action_read_manifest, +} + + +# --------------------------------------------------------------------------- +# Worker context and main loop +# --------------------------------------------------------------------------- + +class WorkerContext: + """Holds per-worker state.""" + + def __init__(self, username: str, db_url: str): + self.username = username + self.db_url = db_url + + +def main(): + """Worker entry point — run the request/response loop.""" + + # Set up orphan prevention + _set_pdeathsig() + + # Configure logging + log_level = os.environ.get("FGC_LOG_LEVEL", "INFO").upper() + + # Use loguru for worker logging, output to stderr + logger.remove() + logger.add(sys.stderr, level=log_level) + + # Configure cluster_api logging + handler = logging.StreamHandler(sys.stderr) + handler.setFormatter(logging.Formatter( + "%(levelname)s | %(name)s:%(funcName)s:%(lineno)d - %(message)s" + )) + cluster_logger = logging.getLogger("cluster_api") + cluster_logger.addHandler(handler) + cluster_logger.setLevel(log_level) + + # Get the socket fd from environment + fd = int(os.environ["FGC_WORKER_FD"]) + sock = socket.fromfd(fd, socket.AF_UNIX, socket.SOCK_STREAM) + os.close(fd) # close the original fd, we have a dup now + + # Determine username + uid = os.getuid() + try: + username = pwd.getpwuid(uid).pw_name + except KeyError: + username = str(uid) + + db_url = os.environ.get("FGC_DB_URL", "") + ctx = WorkerContext(username=username, db_url=db_url) + + logger.info( + f"Worker started for {username} " + f"(uid={uid} euid={os.geteuid()} pid={os.getpid()})" + ) + + # Main request/response loop + while True: + try: + request = _recv(sock) + except ConnectionError: + logger.info("Parent connection closed, exiting") + break + + action = request.get("action") + + if action == "shutdown": + logger.info(f"Shutdown requested, exiting") + break + + handler = _ACTIONS.get(action) + if handler is None: + _send(sock, {"error": f"Unknown action: {action}"}) + continue + + try: + result = handler(request, ctx) + _send(sock, result) + except Exception as e: + logger.exception(f"Error handling action {action}") + _send(sock, {"error": str(e)}) + + sock.close() + logger.info("Worker exiting") + + +if __name__ == "__main__": + main() diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py new file mode 100644 index 00000000..49018000 --- /dev/null +++ b/fileglancer/worker_pool.py @@ -0,0 +1,318 @@ +"""Per-user persistent worker pool. + +Manages a pool of long-lived subprocess workers, one per active user. +Each worker runs with the target user's real UID/GID/groups (set at +fork time via subprocess.Popen kwargs), so the main Uvicorn process +never calls seteuid/setegid/setgroups. + +Workers communicate with the main process over a Unix socketpair using +a length-prefixed JSON protocol. The main process dispatches all +user-scoped work (file I/O, cluster ops, git ops) to the appropriate +worker, keeping the async event loop free. + +Usage from server.py: + + pool = WorkerPool(settings) + worker = await pool.get_worker(username) + result = await worker.execute("list_dir", fsp_name="home", subpath="Documents") +""" + +from __future__ import annotations + +import asyncio +import ctypes +import ctypes.util +import grp +import json +import os +import pwd +import struct +import subprocess +import sys +import time +from typing import Any, Optional + +from loguru import logger + +from fileglancer.settings import Settings + + +# Length-prefix format: 4-byte big-endian unsigned int +_HEADER_FMT = "!I" +_HEADER_SIZE = struct.calcsize(_HEADER_FMT) +_MAX_MESSAGE_SIZE = 64 * 1024 * 1024 # 64 MB safety limit + + +class WorkerError(Exception): + """Raised when a worker returns an error response.""" + pass + + +class WorkerDead(Exception): + """Raised when the worker subprocess has died unexpectedly.""" + pass + + +class UserWorker: + """Wraps a single persistent worker subprocess for one user. + + IPC uses a Unix socketpair with length-prefixed JSON messages. + The worker handles one request at a time (serial). + """ + + def __init__(self, username: str, process: subprocess.Popen, + reader: asyncio.StreamReader, writer: asyncio.StreamWriter): + self.username = username + self.process = process + self.reader = reader + self.writer = writer + self.last_activity = time.monotonic() + self._busy = False + + @property + def is_alive(self) -> bool: + return self.process.poll() is None + + @property + def is_busy(self) -> bool: + return self._busy + + async def execute(self, action: str, **kwargs) -> Any: + """Send a request to the worker and return the parsed response. + + Raises WorkerError on application-level errors from the worker. + Raises WorkerDead if the subprocess has exited. + """ + if not self.is_alive: + raise WorkerDead(f"Worker for {self.username} is dead (rc={self.process.returncode})") + + self._busy = True + self.last_activity = time.monotonic() + try: + request = {"action": action, **kwargs} + await self._send(request) + response = await self._recv() + + if response.get("error"): + raise WorkerError(response["error"]) + + return response + except (BrokenPipeError, ConnectionResetError, asyncio.IncompleteReadError) as e: + raise WorkerDead(f"Worker for {self.username} connection lost: {e}") from e + finally: + self._busy = False + self.last_activity = time.monotonic() + + async def _send(self, data: dict): + payload = json.dumps(data).encode() + header = struct.pack(_HEADER_FMT, len(payload)) + self.writer.write(header + payload) + await self.writer.drain() + + async def _recv(self) -> dict: + header = await self.reader.readexactly(_HEADER_SIZE) + (length,) = struct.unpack(_HEADER_FMT, header) + if length > _MAX_MESSAGE_SIZE: + raise WorkerError(f"Response too large: {length} bytes") + payload = await self.reader.readexactly(length) + return json.loads(payload) + + async def shutdown(self, timeout: float = 5.0): + """Ask the worker to shut down gracefully, then force-kill if needed.""" + if not self.is_alive: + return + try: + await self._send({"action": "shutdown"}) + except (BrokenPipeError, ConnectionResetError, OSError): + pass + + # Wait for clean exit + try: + await asyncio.wait_for( + asyncio.get_event_loop().run_in_executor(None, self.process.wait), + timeout=timeout, + ) + except asyncio.TimeoutError: + logger.warning(f"Worker for {self.username} did not exit in {timeout}s, killing") + self.process.kill() + self.process.wait() + + self.writer.close() + + +class WorkerPool: + """Manages per-user persistent worker subprocesses. + + Workers are spawned on demand and evicted after idle timeout. + """ + + def __init__(self, settings: Settings): + self.settings = settings + self._workers: dict[str, UserWorker] = {} + self._locks: dict[str, asyncio.Lock] = {} + self._eviction_task: Optional[asyncio.Task] = None + self.max_workers = 50 + self.idle_timeout = 300 # seconds + + def _get_lock(self, username: str) -> asyncio.Lock: + if username not in self._locks: + self._locks[username] = asyncio.Lock() + return self._locks[username] + + async def get_worker(self, username: str) -> UserWorker: + """Get or create a worker for the given user.""" + # Fast path: worker exists and is alive + worker = self._workers.get(username) + if worker is not None and worker.is_alive: + worker.last_activity = time.monotonic() + return worker + + # Slow path: need to create or replace worker + async with self._get_lock(username): + # Double-check after acquiring lock + worker = self._workers.get(username) + if worker is not None and worker.is_alive: + worker.last_activity = time.monotonic() + return worker + + # Clean up dead worker if present + if worker is not None: + logger.warning(f"Worker for {username} found dead, replacing") + self._workers.pop(username, None) + + # Evict LRU worker if at capacity + if len(self._workers) >= self.max_workers: + await self._evict_lru() + + # Spawn new worker + new_worker = await self._spawn_worker(username) + self._workers[username] = new_worker + return new_worker + + async def _spawn_worker(self, username: str) -> UserWorker: + """Spawn a new persistent worker subprocess for the given user.""" + pw = pwd.getpwnam(username) + + # Build identity kwargs (only switch if running as root) + identity_kwargs: dict = {} + if os.geteuid() == 0: + groups = [g.gr_gid for g in grp.getgrall() if username in g.gr_mem] + if pw.pw_gid not in groups: + groups.append(pw.pw_gid) + identity_kwargs = { + "user": pw.pw_uid, + "group": pw.pw_gid, + "extra_groups": groups, + } + + # Create Unix socketpair for IPC + parent_sock, child_sock = os.socketpair() + + env = { + **os.environ, + "HOME": pw.pw_dir, + "FGC_LOG_LEVEL": self.settings.log_level, + "FGC_DB_URL": self.settings.db_url, + "FGC_WORKER_FD": str(child_sock.fileno()), + } + + logger.info( + f"Spawning persistent worker for {username} " + f"(uid={pw.pw_uid} gid={pw.pw_gid})" + ) + + process = subprocess.Popen( + [sys.executable, "-m", "fileglancer.user_worker"], + env=env, + pass_fds=(child_sock.fileno(),), + stderr=subprocess.PIPE, + **identity_kwargs, + ) + + # Close child's end in the parent + child_sock.close() + + # Wrap the parent socket fd in asyncio streams + loop = asyncio.get_event_loop() + reader, writer = await asyncio.open_connection(sock=parent_sock) + + # Start a background task to forward worker stderr to loguru + asyncio.create_task(self._forward_stderr(username, process)) + + worker = UserWorker(username, process, reader, writer) + return worker + + async def _forward_stderr(self, username: str, process: subprocess.Popen): + """Forward worker stderr lines to loguru in the background.""" + try: + loop = asyncio.get_event_loop() + while True: + line = await loop.run_in_executor(None, process.stderr.readline) + if not line: + break + logger.debug(f"[worker:{username}] {line.decode().rstrip()}") + except Exception: + pass + + async def _evict_lru(self): + """Evict the least-recently-used idle worker.""" + candidates = [ + (w.last_activity, name, w) + for name, w in self._workers.items() + if not w.is_busy + ] + if not candidates: + logger.warning("Worker pool at capacity with no idle workers to evict") + return + + candidates.sort() + _, name, worker = candidates[0] + logger.info(f"Evicting LRU worker for {name}") + await worker.shutdown() + self._workers.pop(name, None) + + async def start_eviction_loop(self): + """Start the background eviction loop.""" + if self._eviction_task is None or self._eviction_task.done(): + self._eviction_task = asyncio.create_task(self._eviction_loop()) + + async def _eviction_loop(self): + """Periodically evict idle workers.""" + while True: + await asyncio.sleep(60) + now = time.monotonic() + to_evict = [] + for name, worker in list(self._workers.items()): + if not worker.is_busy and (now - worker.last_activity) > self.idle_timeout: + to_evict.append(name) + elif not worker.is_alive: + to_evict.append(name) + + for name in to_evict: + worker = self._workers.pop(name, None) + if worker is not None: + if worker.is_alive: + logger.info(f"Evicting idle worker for {name}") + await worker.shutdown() + else: + logger.info(f"Removing dead worker for {name}") + + async def shutdown_all(self): + """Shut down all workers (called during server shutdown).""" + if self._eviction_task and not self._eviction_task.done(): + self._eviction_task.cancel() + try: + await self._eviction_task + except asyncio.CancelledError: + pass + + tasks = [] + for name, worker in list(self._workers.items()): + logger.info(f"Shutting down worker for {name}") + tasks.append(worker.shutdown(timeout=10.0)) + + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) + + self._workers.clear() + self._locks.clear() From e88956cc46ac17e728fc541a1aacae4b77293ab1 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 11:08:15 -0400 Subject: [PATCH 02/39] pass file descriptors using SCM_RIGHTS for performance --- fileglancer/server.py | 41 +++++++------- fileglancer/user_worker.py | 67 ++++++++++++++++------- fileglancer/worker_pool.py | 107 +++++++++++++++++++++++++++---------- 3 files changed, 147 insertions(+), 68 deletions(-) diff --git a/fileglancer/server.py b/fileglancer/server.py index 3f812223..0ec707a2 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -231,6 +231,11 @@ async def _worker_exec(username: str, action: str, **kwargs): When use_access_flags=False (dev/test mode), runs the action directly in the current process since no identity switching is needed. + If the worker opens a file and passes back a file descriptor (e.g. + open_file, s3_open_object), the response dict will contain a + ``_file_handle`` key with an open file object. Callers that don't + need it can ignore this key. + Raises HTTPException on worker-level errors or dead workers. """ if worker_pool is not None: @@ -250,7 +255,10 @@ async def _worker_exec(username: str, action: str, **kwargs): raise HTTPException(status_code=500, detail=f"Unknown action: {action}") ctx = WorkerContext(username=username, db_url=settings.db_url) request = {"action": action, **kwargs} - return handler(request, ctx) + result = handler(request, ctx) + # Strip the raw fd (not meaningful in-process), keep _file_handle + result.pop("_fd", None) + return result def _resolve_proxy_info(sharing_key: str, captured_path: str) -> Tuple[dict | Response, str]: """Resolve a sharing key to proxy info (mount_path, target_name, username, subpath). @@ -1140,20 +1148,16 @@ async def target_dispatcher(request: Request, else: range_header = request.headers.get("range") - result = await _worker_exec(info["username"], "s3_open_object", - mount_path=info["mount_path"], - target_name=info["target_name"], - path=subpath, - range_header=range_header) - - if result.get("type") == "handle": - # Worker validated access and returned file metadata - # Open the file in main process (root can read anything) - resolved_path = result["resolved_path"] - if resolved_path is None: - return get_error_response(404, "NoSuchKey", "File not found", subpath) + result = await _worker_exec( + info["username"], "s3_open_object", + mount_path=info["mount_path"], + target_name=info["target_name"], + path=subpath, + range_header=range_header) - file_handle = open(resolved_path, "rb") + file_handle = result.pop("_file_handle", None) + if result.get("type") == "handle" and file_handle is not None: + # Worker opened the file and passed the fd via SCM_RIGHTS from x2s3.client_file import FileObjectHandle, file_iterator handle = FileObjectHandle( target_name=result["target_name"], @@ -1317,7 +1321,7 @@ async def get_file_content(request: Request, path_name: str, subpath: Optional[s else: filestore_name, _, subpath = path_name.partition('/') - # Worker validates path and returns metadata (runs as user) + # Worker opens the file as the user and passes the fd back result = await _worker_exec(username, "open_file", fsp_name=filestore_name, subpath=subpath) if result.get("redirect"): @@ -1328,15 +1332,12 @@ async def get_file_content(request: Request, path_name: str, subpath: Optional[s if "error" in result: raise HTTPException(status_code=result.get("status_code", 500), detail=result["error"]) - full_path = result["full_path"] + file_handle = result.get("_file_handle") + file_size = result["file_size"] content_type = result["content_type"] file_name = subpath.split('/')[-1] if subpath else '' - # Open file in main process — the worker validated access; - # main process runs as root so it can open the validated path - file_handle = open(full_path, 'rb') - range_header = request.headers.get('Range') if range_header: diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 0777085b..db77c095 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -86,6 +86,21 @@ def _send(sock: socket.socket, data: dict): sock.sendall(header + payload) +def _send_with_fd(sock: socket.socket, data: dict, fd: int): + """Send a length-prefixed JSON message with a file descriptor via SCM_RIGHTS.""" + import array as _array + + payload = json.dumps(data, default=str).encode() + header = struct.pack(_HEADER_FMT, len(payload)) + full_msg = header + payload + + fds = _array.array("i", [fd]) + sock.sendmsg( + [full_msg], + [(socket.SOL_SOCKET, socket.SCM_RIGHTS, fds)], + ) + + def _recv(sock: socket.socket) -> dict: """Receive a length-prefixed JSON message.""" header = _recvall(sock, _HEADER_SIZE) @@ -261,11 +276,11 @@ def _action_check_binary(request: dict, ctx: WorkerContext) -> dict: def _action_open_file(request: dict, ctx: WorkerContext) -> dict: - """Open a file and return its metadata. + """Open a file and return its metadata + open file descriptor. - The actual file content is read and returned as part of the response - for simplicity. For very large files, fd passing via SCM_RIGHTS - could be added as an optimization. + The worker opens the file as the user and passes the fd back to the + main process via SCM_RIGHTS. The response includes "_fd" key with the + fd number — the main loop uses _send_with_fd() for these responses. """ fsp_name = request["fsp_name"] subpath = request.get("subpath", "") @@ -286,10 +301,15 @@ def _action_open_file(request: dict, ctx: WorkerContext) -> dict: content_type = guess_content_type(file_name) full_path = filestore._check_path_in_root(subpath) + # Open the file — the fd retains user's access rights + file_handle = open(full_path, 'rb') + fd = file_handle.fileno() + return { - "full_path": full_path, "file_size": file_info.size, "content_type": content_type, + "_fd": fd, + "_file_handle": file_handle, # kept alive until fd is sent } except RootCheckError as e: from fileglancer import database as db @@ -653,17 +673,13 @@ def _action_s3_head_object(request: dict, ctx: WorkerContext) -> dict: def _action_s3_open_object(request: dict, ctx: WorkerContext) -> dict: - """S3-compatible open object — open the file and return metadata. - - The actual file descriptor stays open in this process; - for now we return the path so the main process can open it - with the fd still valid via the user context. - Actually: the worker opens the file and returns metadata. - The main process will open its own fd (the worker already proved - access is valid by running as the user). + """S3-compatible open object — open the file and pass the fd back. + + The worker opens the file as the user via FileProxyClient.open_object(), + then passes the file descriptor to the main process via SCM_RIGHTS. + The main process wraps it in a StreamingResponse. """ from x2s3.client_file import FileProxyClient, FileObjectHandle - from x2s3.utils import get_nosuchkey_response, get_error_response mount_path = request["mount_path"] target_name = request["target_name"] @@ -679,8 +695,8 @@ def _action_s3_open_object(request: dict, ctx: WorkerContext) -> dict: result = _run_async(client.open_object(path, range_header)) if isinstance(result, FileObjectHandle): - # Return metadata; the file handle stays open in the worker - # The main process will need to stream from here + # Keep the file handle alive and pass the fd + fd = result.file_handle.fileno() response = { "type": "handle", "status_code": result.status_code, @@ -691,10 +707,11 @@ def _action_s3_open_object(request: dict, ctx: WorkerContext) -> dict: "target_name": result.target_name, "start": result.start, "end": result.end, - # Include the resolved path so main process can open it - "resolved_path": client._safe_path(path), + "_fd": fd, + "_file_handle": result.file_handle, # kept alive until fd is sent } - result.close() + # Don't close the handle — the fd needs to survive transfer + # The main process will close it after streaming return response else: # Error response @@ -1012,7 +1029,17 @@ def main(): try: result = handler(request, ctx) - _send(sock, result) + + # If the result contains a file descriptor, send it via SCM_RIGHTS + fd = result.pop("_fd", None) + file_handle = result.pop("_file_handle", None) + if fd is not None: + _send_with_fd(sock, result, fd) + # Close our copy of the fd — the main process has its own now + if file_handle is not None: + file_handle.close() + else: + _send(sock, result) except Exception as e: logger.exception(f"Error handling action {action}") _send(sock, {"error": str(e)}) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 49018000..3973ef6f 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -6,26 +6,30 @@ never calls seteuid/setegid/setgroups. Workers communicate with the main process over a Unix socketpair using -a length-prefixed JSON protocol. The main process dispatches all -user-scoped work (file I/O, cluster ops, git ops) to the appropriate -worker, keeping the async event loop free. +a length-prefixed JSON protocol. When a worker response includes a file +descriptor (e.g. an opened file for streaming), it arrives transparently +via SCM_RIGHTS — callers see a ``_file_handle`` key in the response dict. Usage from server.py: pool = WorkerPool(settings) worker = await pool.get_worker(username) result = await worker.execute("list_dir", fsp_name="home", subpath="Documents") + + # For actions that open files, the fd arrives automatically: + result = await worker.execute("open_file", fsp_name="home", subpath="data.bin") + file_handle = result.get("_file_handle") # open file object, or None """ from __future__ import annotations +import array import asyncio -import ctypes -import ctypes.util import grp import json import os import pwd +import socket import struct import subprocess import sys @@ -56,16 +60,17 @@ class WorkerDead(Exception): class UserWorker: """Wraps a single persistent worker subprocess for one user. - IPC uses a Unix socketpair with length-prefixed JSON messages. - The worker handles one request at a time (serial). + IPC uses a blocking Unix socket accessed from a thread (via + run_in_executor) so the async event loop is never blocked. + All receives use recvmsg(), which transparently handles both + plain messages and messages carrying file descriptors via SCM_RIGHTS. """ def __init__(self, username: str, process: subprocess.Popen, - reader: asyncio.StreamReader, writer: asyncio.StreamWriter): + sock: socket.socket): self.username = username self.process = process - self.reader = reader - self.writer = writer + self.sock = sock self.last_activity = time.monotonic() self._busy = False @@ -80,6 +85,9 @@ def is_busy(self) -> bool: async def execute(self, action: str, **kwargs) -> Any: """Send a request to the worker and return the parsed response. + If the worker sends a file descriptor (SCM_RIGHTS), the response + dict will contain a ``_file_handle`` key with an open file object. + Raises WorkerError on application-level errors from the worker. Raises WorkerDead if the subprocess has exited. """ @@ -90,39 +98,81 @@ async def execute(self, action: str, **kwargs) -> Any: self.last_activity = time.monotonic() try: request = {"action": action, **kwargs} - await self._send(request) - response = await self._recv() + loop = asyncio.get_event_loop() + response = await loop.run_in_executor( + None, self._send_and_recv, request) if response.get("error"): + # Close any fd that arrived with an error response + fh = response.pop("_file_handle", None) + if fh is not None: + fh.close() raise WorkerError(response["error"]) return response - except (BrokenPipeError, ConnectionResetError, asyncio.IncompleteReadError) as e: + except (BrokenPipeError, ConnectionResetError, OSError) as e: raise WorkerDead(f"Worker for {self.username} connection lost: {e}") from e finally: self._busy = False self.last_activity = time.monotonic() - async def _send(self, data: dict): - payload = json.dumps(data).encode() + def _send_and_recv(self, request: dict) -> dict: + """Send a request and receive the response (blocking, runs in thread). + + Uses recvmsg() which transparently handles optional SCM_RIGHTS fds. + """ + # Send + payload = json.dumps(request).encode() header = struct.pack(_HEADER_FMT, len(payload)) - self.writer.write(header + payload) - await self.writer.drain() + self.sock.sendall(header + payload) - async def _recv(self) -> dict: - header = await self.reader.readexactly(_HEADER_SIZE) + # Receive header + header = self._recvall(_HEADER_SIZE) (length,) = struct.unpack(_HEADER_FMT, header) if length > _MAX_MESSAGE_SIZE: raise WorkerError(f"Response too large: {length} bytes") - payload = await self.reader.readexactly(length) - return json.loads(payload) + + # Receive payload + optional fd via recvmsg + fds = array.array("i") + body = b"" + while len(body) < length: + msg, ancdata, flags, addr = self.sock.recvmsg( + length - len(body), + socket.CMSG_LEN(struct.calcsize("i")), # space for 1 fd + ) + if not msg: + raise ConnectionError("Worker closed connection mid-message") + body += msg + for cmsg_level, cmsg_type, cmsg_data in ancdata: + if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: + fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) + + response = json.loads(body) + + # If an fd arrived, wrap it in a file object + if fds: + response["_file_handle"] = os.fdopen(fds[0], "rb") + + return response + + def _recvall(self, n: int) -> bytes: + """Read exactly n bytes from the socket.""" + data = bytearray() + while len(data) < n: + chunk = self.sock.recv(n - len(data)) + if not chunk: + raise ConnectionError("Worker closed connection") + data.extend(chunk) + return bytes(data) async def shutdown(self, timeout: float = 5.0): """Ask the worker to shut down gracefully, then force-kill if needed.""" if not self.is_alive: return try: - await self._send({"action": "shutdown"}) + payload = json.dumps({"action": "shutdown"}).encode() + header = struct.pack(_HEADER_FMT, len(payload)) + self.sock.sendall(header + payload) except (BrokenPipeError, ConnectionResetError, OSError): pass @@ -137,7 +187,10 @@ async def shutdown(self, timeout: float = 5.0): self.process.kill() self.process.wait() - self.writer.close() + try: + self.sock.close() + except OSError: + pass class WorkerPool: @@ -232,15 +285,13 @@ async def _spawn_worker(self, username: str) -> UserWorker: # Close child's end in the parent child_sock.close() - # Wrap the parent socket fd in asyncio streams - loop = asyncio.get_event_loop() - reader, writer = await asyncio.open_connection(sock=parent_sock) + # Keep the socket blocking — all I/O runs in a thread via run_in_executor + parent_sock.setblocking(True) # Start a background task to forward worker stderr to loguru asyncio.create_task(self._forward_stderr(username, process)) - worker = UserWorker(username, process, reader, writer) - return worker + return UserWorker(username, process, parent_sock) async def _forward_stderr(self, username: str, process: subprocess.Popen): """Forward worker stderr lines to loguru in the background.""" From e07027bd60511a8763a5ae0d4760db632641590b Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 11:36:03 -0400 Subject: [PATCH 03/39] fixed code --- fileglancer/worker_pool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 3973ef6f..d28acebf 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -259,7 +259,7 @@ async def _spawn_worker(self, username: str) -> UserWorker: } # Create Unix socketpair for IPC - parent_sock, child_sock = os.socketpair() + parent_sock, child_sock = socket.socketpair() env = { **os.environ, From c4c6c4cdca63a3be44f3d980d220641250153f18 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 11:39:43 -0400 Subject: [PATCH 04/39] added worker tests and fixed SCM_RIGHTS bug --- fileglancer/worker_pool.py | 48 ++-- tests/test_worker.py | 532 +++++++++++++++++++++++++++++++++++++ 2 files changed, 559 insertions(+), 21 deletions(-) create mode 100644 tests/test_worker.py diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index d28acebf..19e198b3 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -119,34 +119,50 @@ async def execute(self, action: str, **kwargs) -> Any: def _send_and_recv(self, request: dict) -> dict: """Send a request and receive the response (blocking, runs in thread). - Uses recvmsg() which transparently handles optional SCM_RIGHTS fds. + All receives use recvmsg() so that SCM_RIGHTS file descriptors are + captured transparently — the ancillary data arrives with the first + bytes of the message, so we must use recvmsg for the header too. """ # Send payload = json.dumps(request).encode() header = struct.pack(_HEADER_FMT, len(payload)) self.sock.sendall(header + payload) - # Receive header - header = self._recvall(_HEADER_SIZE) - (length,) = struct.unpack(_HEADER_FMT, header) + # Receive header + payload + optional fd, all via recvmsg + fds = array.array("i") + raw = b"" + # First, read at least the header + while len(raw) < _HEADER_SIZE: + msg, ancdata, flags, addr = self.sock.recvmsg( + max(_HEADER_SIZE - len(raw), 4096), + socket.CMSG_LEN(struct.calcsize("i")), + ) + if not msg: + raise ConnectionError("Worker closed connection") + raw += msg + for cmsg_level, cmsg_type, cmsg_data in ancdata: + if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: + fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) + + (length,) = struct.unpack(_HEADER_FMT, raw[:_HEADER_SIZE]) if length > _MAX_MESSAGE_SIZE: raise WorkerError(f"Response too large: {length} bytes") - # Receive payload + optional fd via recvmsg - fds = array.array("i") - body = b"" - while len(body) < length: + # We may have read some payload bytes with the header + total_needed = _HEADER_SIZE + length + while len(raw) < total_needed: msg, ancdata, flags, addr = self.sock.recvmsg( - length - len(body), - socket.CMSG_LEN(struct.calcsize("i")), # space for 1 fd + total_needed - len(raw), + socket.CMSG_LEN(struct.calcsize("i")), ) if not msg: raise ConnectionError("Worker closed connection mid-message") - body += msg + raw += msg for cmsg_level, cmsg_type, cmsg_data in ancdata: if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) + body = raw[_HEADER_SIZE:_HEADER_SIZE + length] response = json.loads(body) # If an fd arrived, wrap it in a file object @@ -155,16 +171,6 @@ def _send_and_recv(self, request: dict) -> dict: return response - def _recvall(self, n: int) -> bytes: - """Read exactly n bytes from the socket.""" - data = bytearray() - while len(data) < n: - chunk = self.sock.recv(n - len(data)) - if not chunk: - raise ConnectionError("Worker closed connection") - data.extend(chunk) - return bytes(data) - async def shutdown(self, timeout: float = 5.0): """Ask the worker to shut down gracefully, then force-kill if needed.""" if not self.is_alive: diff --git a/tests/test_worker.py b/tests/test_worker.py new file mode 100644 index 00000000..f4f75a2a --- /dev/null +++ b/tests/test_worker.py @@ -0,0 +1,532 @@ +"""Tests for the per-user persistent worker infrastructure. + +Tests the IPC protocol (length-prefixed JSON, SCM_RIGHTS fd passing), +worker lifecycle (spawn, execute, shutdown, crash recovery), +and the in-process dev-mode fallback. +""" + +import asyncio +import json +import os +import socket +import struct +import tempfile +import time + +import pytest + +from fileglancer.user_worker import ( + _send, + _send_with_fd, + _recv, + _ACTIONS, + WorkerContext, + _HEADER_FMT, + _HEADER_SIZE, +) +from fileglancer.worker_pool import ( + UserWorker, + WorkerPool, + WorkerError, + WorkerDead, +) + + +# --------------------------------------------------------------------------- +# IPC protocol tests (user_worker.py _send/_recv/_send_with_fd) +# --------------------------------------------------------------------------- + +class TestIPCProtocol: + """Test the length-prefixed JSON wire protocol.""" + + def test_send_recv_roundtrip(self): + """A message sent with _send can be read back with _recv.""" + a, b = socket.socketpair() + try: + msg = {"action": "test", "value": 42, "nested": {"key": "val"}} + _send(a, msg) + result = _recv(b) + assert result == msg + finally: + a.close() + b.close() + + def test_send_recv_empty_dict(self): + """Empty dicts round-trip correctly.""" + a, b = socket.socketpair() + try: + _send(a, {}) + assert _recv(b) == {} + finally: + a.close() + b.close() + + def test_send_recv_large_message(self): + """Messages larger than a single recv buffer work.""" + a, b = socket.socketpair() + try: + # Create a message larger than typical socket buffer + big_value = "x" * 100_000 + msg = {"data": big_value} + _send(a, msg) + result = _recv(b) + assert result["data"] == big_value + finally: + a.close() + b.close() + + def test_send_recv_multiple_messages(self): + """Multiple sequential messages on the same socket.""" + a, b = socket.socketpair() + try: + for i in range(10): + _send(a, {"seq": i}) + for i in range(10): + result = _recv(b) + assert result == {"seq": i} + finally: + a.close() + b.close() + + def test_recv_connection_closed(self): + """_recv raises ConnectionError when the peer closes the socket.""" + a, b = socket.socketpair() + a.close() + with pytest.raises(ConnectionError): + _recv(b) + b.close() + + def test_send_with_fd_passes_file_descriptor(self): + """_send_with_fd sends a file descriptor via SCM_RIGHTS.""" + import array + + a, b = socket.socketpair() + try: + # Create a temp file and send its fd + with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: + f.write("hello from fd passing") + temp_path = f.name + + fd_to_send = os.open(temp_path, os.O_RDONLY) + try: + msg = {"type": "handle", "size": 21} + _send_with_fd(a, msg, fd_to_send) + + # Receive using recvmsg for EVERYTHING (header + payload + ancillary) + # The fd arrives with the first bytes, so we must use recvmsg from the start + fds = array.array("i") + raw = b"" + total_header = _HEADER_SIZE + while len(raw) < total_header: + data, ancdata, flags, addr = b.recvmsg( + 4096, + socket.CMSG_LEN(struct.calcsize("i")), + ) + raw += data + for cmsg_level, cmsg_type, cmsg_data in ancdata: + if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: + fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) + + (length,) = struct.unpack(_HEADER_FMT, raw[:_HEADER_SIZE]) + total_needed = _HEADER_SIZE + length + while len(raw) < total_needed: + data, ancdata, flags, addr = b.recvmsg( + total_needed - len(raw), + socket.CMSG_LEN(struct.calcsize("i")), + ) + raw += data + for cmsg_level, cmsg_type, cmsg_data in ancdata: + if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: + fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) + + payload = raw[_HEADER_SIZE:_HEADER_SIZE + length] + result = json.loads(payload) + assert result == {"type": "handle", "size": 21} + assert len(fds) == 1 + + # Read from the received fd + received_fd = fds[0] + with os.fdopen(received_fd, 'r') as f: + content = f.read() + assert content == "hello from fd passing" + finally: + os.close(fd_to_send) + os.unlink(temp_path) + finally: + a.close() + b.close() + + +# --------------------------------------------------------------------------- +# UserWorker IPC integration tests (worker_pool.py _send_and_recv) +# --------------------------------------------------------------------------- + +class TestUserWorkerIPC: + """Test UserWorker's _send_and_recv with a mock worker on the other end.""" + + def _make_worker_pair(self): + """Create a UserWorker connected to a mock 'worker' socket.""" + parent, child = socket.socketpair() + parent.setblocking(True) + + # Create a fake Popen-like object + class FakeProcess: + returncode = None + def poll(self): return None + def wait(self): pass + def kill(self): pass + + worker = UserWorker("testuser", FakeProcess(), parent) + return worker, child + + def test_send_and_recv_basic(self): + """Basic request/response over the socket.""" + worker, child = self._make_worker_pair() + try: + # Simulate worker: read request, send response + def mock_worker(): + req = _recv(child) + assert req["action"] == "ping" + _send(child, {"status": "pong"}) + + import threading + t = threading.Thread(target=mock_worker) + t.start() + + result = worker._send_and_recv({"action": "ping"}) + assert result == {"status": "pong"} + t.join() + finally: + worker.sock.close() + child.close() + + def test_send_and_recv_with_fd(self): + """Response with SCM_RIGHTS fd is auto-wrapped in _file_handle.""" + worker, child = self._make_worker_pair() + try: + with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: + f.write("fd test content") + temp_path = f.name + + def mock_worker(): + req = _recv(child) + fd = os.open(temp_path, os.O_RDONLY) + _send_with_fd(child, {"type": "handle", "size": 15}, fd) + os.close(fd) + + import threading + t = threading.Thread(target=mock_worker) + t.start() + + result = worker._send_and_recv({"action": "open_file"}) + assert result["type"] == "handle" + assert "_file_handle" in result + + fh = result["_file_handle"] + content = fh.read().decode() + fh.close() + assert content == "fd test content" + + t.join() + os.unlink(temp_path) + finally: + worker.sock.close() + child.close() + + def test_send_and_recv_no_fd(self): + """Normal response without fd has no _file_handle key.""" + worker, child = self._make_worker_pair() + try: + def mock_worker(): + _recv(child) + _send(child, {"files": [1, 2, 3]}) + + import threading + t = threading.Thread(target=mock_worker) + t.start() + + result = worker._send_and_recv({"action": "list_dir"}) + assert result == {"files": [1, 2, 3]} + assert "_file_handle" not in result + t.join() + finally: + worker.sock.close() + child.close() + + +# --------------------------------------------------------------------------- +# UserWorker async execute tests +# --------------------------------------------------------------------------- + +class TestUserWorkerExecute: + """Test the async execute() method.""" + + def _make_worker_pair(self): + parent, child = socket.socketpair() + parent.setblocking(True) + + class FakeProcess: + returncode = None + def poll(self): return None + def wait(self): pass + def kill(self): pass + + worker = UserWorker("testuser", FakeProcess(), parent) + return worker, child + + @pytest.mark.asyncio + async def test_execute_success(self): + worker, child = self._make_worker_pair() + try: + import threading + def mock_worker(): + _recv(child) + _send(child, {"result": "ok"}) + + t = threading.Thread(target=mock_worker) + t.start() + + result = await worker.execute("test_action") + assert result == {"result": "ok"} + t.join() + finally: + worker.sock.close() + child.close() + + @pytest.mark.asyncio + async def test_execute_worker_error(self): + worker, child = self._make_worker_pair() + try: + import threading + def mock_worker(): + _recv(child) + _send(child, {"error": "something broke"}) + + t = threading.Thread(target=mock_worker) + t.start() + + with pytest.raises(WorkerError, match="something broke"): + await worker.execute("bad_action") + t.join() + finally: + worker.sock.close() + child.close() + + @pytest.mark.asyncio + async def test_execute_dead_worker(self): + parent, child = socket.socketpair() + parent.setblocking(True) + child.close() + + class DeadProcess: + returncode = 1 + def poll(self): return 1 + def wait(self): pass + def kill(self): pass + + worker = UserWorker("testuser", DeadProcess(), parent) + with pytest.raises(WorkerDead): + await worker.execute("anything") + parent.close() + + @pytest.mark.asyncio + async def test_execute_with_fd_transparent(self): + """execute() transparently includes _file_handle when worker sends fd.""" + worker, child = self._make_worker_pair() + try: + with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: + f.write("transparent fd") + temp_path = f.name + + import threading + def mock_worker(): + _recv(child) + fd = os.open(temp_path, os.O_RDONLY) + _send_with_fd(child, {"content_type": "text/plain"}, fd) + os.close(fd) + + t = threading.Thread(target=mock_worker) + t.start() + + result = await worker.execute("open_file") + assert result["content_type"] == "text/plain" + assert "_file_handle" in result + + fh = result["_file_handle"] + assert fh.read().decode() == "transparent fd" + fh.close() + + t.join() + os.unlink(temp_path) + finally: + worker.sock.close() + child.close() + + +# --------------------------------------------------------------------------- +# Action handler tests (user_worker.py actions run in-process) +# --------------------------------------------------------------------------- + +class TestActionHandlers: + """Test action handlers directly (simulates dev/test mode).""" + + @pytest.fixture + def temp_dir(self): + d = tempfile.mkdtemp() + # Create test files + with open(os.path.join(d, "hello.txt"), "w") as f: + f.write("hello world") + os.makedirs(os.path.join(d, "subdir")) + with open(os.path.join(d, "subdir", "nested.txt"), "w") as f: + f.write("nested content") + yield d + import shutil + shutil.rmtree(d) + + @pytest.fixture + def ctx(self, temp_dir): + """Create a WorkerContext with a test database.""" + from fileglancer.settings import get_settings + settings = get_settings() + return WorkerContext(username=os.environ.get("USER", "test"), db_url=settings.db_url) + + def test_get_profile(self, ctx): + handler = _ACTIONS["get_profile"] + result = handler({"action": "get_profile"}, ctx) + assert "username" in result + assert "groups" in result + assert isinstance(result["groups"], list) + + def test_unknown_action(self): + """Unknown actions are not in the registry.""" + assert "nonexistent_action" not in _ACTIONS + + def test_validate_paths_empty(self, ctx): + handler = _ACTIONS["validate_paths"] + result = handler({"action": "validate_paths", "paths": {}}, ctx) + assert result == {"errors": {}} + + +# --------------------------------------------------------------------------- +# Worker main loop integration test +# --------------------------------------------------------------------------- + +class TestWorkerMainLoop: + """Test the worker subprocess main loop via socketpair (no actual subprocess).""" + + def _run_worker_loop(self, child_sock): + """Run the worker main loop in a thread using the given socket.""" + import threading + + def target(): + # Simulate what main() does, but with our socket + sock = child_sock + uid = os.getuid() + try: + username = os.environ.get("USER", str(uid)) + except KeyError: + username = str(uid) + + from fileglancer.settings import get_settings + settings = get_settings() + ctx = WorkerContext(username=username, db_url=settings.db_url) + + while True: + try: + request = _recv(sock) + except ConnectionError: + break + + action = request.get("action") + if action == "shutdown": + break + + handler = _ACTIONS.get(action) + if handler is None: + _send(sock, {"error": f"Unknown action: {action}"}) + continue + + try: + result = handler(request, ctx) + fd = result.pop("_fd", None) + file_handle = result.pop("_file_handle", None) + if fd is not None: + _send_with_fd(sock, result, fd) + if file_handle is not None: + file_handle.close() + else: + _send(sock, result) + except Exception as e: + _send(sock, {"error": str(e)}) + + sock.close() + + t = threading.Thread(target=target, daemon=True) + t.start() + return t + + def test_shutdown_message(self): + """Worker exits cleanly on shutdown message.""" + parent, child = socket.socketpair() + t = self._run_worker_loop(child) + + _send(parent, {"action": "shutdown"}) + t.join(timeout=5) + assert not t.is_alive() + parent.close() + + def test_unknown_action_returns_error(self): + """Worker returns error for unknown actions.""" + parent, child = socket.socketpair() + t = self._run_worker_loop(child) + + _send(parent, {"action": "totally_fake"}) + result = _recv(parent) + assert "error" in result + assert "Unknown action" in result["error"] + + _send(parent, {"action": "shutdown"}) + t.join(timeout=5) + parent.close() + + def test_get_profile_via_loop(self): + """End-to-end: send get_profile through the worker loop.""" + parent, child = socket.socketpair() + t = self._run_worker_loop(child) + + _send(parent, {"action": "get_profile"}) + result = _recv(parent) + assert "username" in result + assert "groups" in result + + _send(parent, {"action": "shutdown"}) + t.join(timeout=5) + parent.close() + + def test_multiple_requests(self): + """Worker handles multiple sequential requests.""" + parent, child = socket.socketpair() + t = self._run_worker_loop(child) + + # Send several requests + _send(parent, {"action": "get_profile"}) + r1 = _recv(parent) + assert "username" in r1 + + _send(parent, {"action": "validate_paths", "paths": {}}) + r2 = _recv(parent) + assert r2 == {"errors": {}} + + _send(parent, {"action": "shutdown"}) + t.join(timeout=5) + parent.close() + + def test_connection_close_exits_loop(self): + """Worker exits when parent closes the socket.""" + parent, child = socket.socketpair() + t = self._run_worker_loop(child) + + # Close without sending shutdown — worker should detect and exit + parent.close() + t.join(timeout=5) + assert not t.is_alive() From 6d7ffa47a318e5193c1d3c92e741f9e0cd706bf5 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 11:42:43 -0400 Subject: [PATCH 05/39] fixed recvmsg bug --- fileglancer/server.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fileglancer/server.py b/fileglancer/server.py index 0ec707a2..95f54145 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -246,7 +246,7 @@ async def _worker_exec(username: str, action: str, **kwargs): logger.error(f"Worker dead for {username}: {e}") raise HTTPException(status_code=503, detail="Service temporarily unavailable") except WorkerError as e: - raise # Let caller handle application-level errors + raise HTTPException(status_code=500, detail=str(e)) else: # Dev/test mode: run action directly in-process from fileglancer.user_worker import _ACTIONS, WorkerContext @@ -1799,8 +1799,10 @@ async def get_job_file(job_id: int, if content is None: raise HTTPException(status_code=404, detail=f"File not found: {file_type}") return PlainTextResponse(content) - except WorkerError as e: - raise HTTPException(status_code=404, detail=str(e)) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) def _ensure_utc(dt: Optional[datetime]) -> Optional[datetime]: """Re-attach UTC timezone to naive datetimes from the DB. From 3b6b455e709be4df10d00d8400715d6207b66034 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 11:44:57 -0400 Subject: [PATCH 06/39] add lock so workers only handle one request at a time --- fileglancer/worker_pool.py | 46 +++++++++++++++++++++----------------- tests/test_worker.py | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 20 deletions(-) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 19e198b3..3381f8f6 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -73,6 +73,7 @@ def __init__(self, username: str, process: subprocess.Popen, self.sock = sock self.last_activity = time.monotonic() self._busy = False + self._lock = asyncio.Lock() # serialize requests to the worker @property def is_alive(self) -> bool: @@ -88,33 +89,38 @@ async def execute(self, action: str, **kwargs) -> Any: If the worker sends a file descriptor (SCM_RIGHTS), the response dict will contain a ``_file_handle`` key with an open file object. + Requests are serialized per-worker via an asyncio lock — the worker + subprocess handles one request at a time, so concurrent callers + must not interleave their sends/receives on the shared socket. + Raises WorkerError on application-level errors from the worker. Raises WorkerDead if the subprocess has exited. """ if not self.is_alive: raise WorkerDead(f"Worker for {self.username} is dead (rc={self.process.returncode})") - self._busy = True - self.last_activity = time.monotonic() - try: - request = {"action": action, **kwargs} - loop = asyncio.get_event_loop() - response = await loop.run_in_executor( - None, self._send_and_recv, request) - - if response.get("error"): - # Close any fd that arrived with an error response - fh = response.pop("_file_handle", None) - if fh is not None: - fh.close() - raise WorkerError(response["error"]) - - return response - except (BrokenPipeError, ConnectionResetError, OSError) as e: - raise WorkerDead(f"Worker for {self.username} connection lost: {e}") from e - finally: - self._busy = False + async with self._lock: + self._busy = True self.last_activity = time.monotonic() + try: + request = {"action": action, **kwargs} + loop = asyncio.get_event_loop() + response = await loop.run_in_executor( + None, self._send_and_recv, request) + + if response.get("error"): + # Close any fd that arrived with an error response + fh = response.pop("_file_handle", None) + if fh is not None: + fh.close() + raise WorkerError(response["error"]) + + return response + except (BrokenPipeError, ConnectionResetError, OSError) as e: + raise WorkerDead(f"Worker for {self.username} connection lost: {e}") from e + finally: + self._busy = False + self.last_activity = time.monotonic() def _send_and_recv(self, request: dict) -> dict: """Send a request and receive the response (blocking, runs in thread). diff --git a/tests/test_worker.py b/tests/test_worker.py index f4f75a2a..214e9ce2 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -363,6 +363,47 @@ def mock_worker(): child.close() + @pytest.mark.asyncio + async def test_concurrent_execute_serialized(self): + """Concurrent execute() calls are serialized — responses never get swapped.""" + worker, child = self._make_worker_pair() + try: + import threading + + def mock_worker(): + """Echo worker: returns the action name in the response.""" + for _ in range(20): + try: + req = _recv(child) + except ConnectionError: + break + action = req.get("action", "unknown") + if action == "shutdown": + break + # Simulate some work + time.sleep(0.01) + _send(child, {"action_echo": action, "seq": req.get("seq")}) + + t = threading.Thread(target=mock_worker, daemon=True) + t.start() + + # Fire 10 concurrent requests with different actions + async def make_request(seq): + action = f"action_{seq}" + result = await worker.execute(action, seq=seq) + # Verify we got OUR response back, not someone else's + assert result["action_echo"] == action + assert result["seq"] == seq + + await asyncio.gather(*[make_request(i) for i in range(10)]) + + _send(child, {"action": "shutdown"}) # won't be read, but close cleanly + t.join(timeout=5) + finally: + worker.sock.close() + child.close() + + # --------------------------------------------------------------------------- # Action handler tests (user_worker.py actions run in-process) # --------------------------------------------------------------------------- From 942dbcac43eaa9d40dcc07cb4402bee7954429af Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 11:46:29 -0400 Subject: [PATCH 07/39] show log when worker triggers 500 error --- fileglancer/server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fileglancer/server.py b/fileglancer/server.py index 95f54145..ad5e63e8 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -246,6 +246,7 @@ async def _worker_exec(username: str, action: str, **kwargs): logger.error(f"Worker dead for {username}: {e}") raise HTTPException(status_code=503, detail="Service temporarily unavailable") except WorkerError as e: + logger.error(f"Worker error for {username} action={action}: {e}") raise HTTPException(status_code=500, detail=str(e)) else: # Dev/test mode: run action directly in-process From 0dd3e0962a4ae744df595624c182112e57733eca Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 11:49:50 -0400 Subject: [PATCH 08/39] worker error handling --- fileglancer/server.py | 5 +++-- fileglancer/user_worker.py | 4 ++-- fileglancer/worker_pool.py | 9 +++++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fileglancer/server.py b/fileglancer/server.py index ad5e63e8..13640c55 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -246,8 +246,9 @@ async def _worker_exec(username: str, action: str, **kwargs): logger.error(f"Worker dead for {username}: {e}") raise HTTPException(status_code=503, detail="Service temporarily unavailable") except WorkerError as e: - logger.error(f"Worker error for {username} action={action}: {e}") - raise HTTPException(status_code=500, detail=str(e)) + if e.status_code >= 500: + logger.error(f"Worker error for {username} action={action}: {e}") + raise HTTPException(status_code=e.status_code, detail=str(e)) else: # Dev/test mode: run action directly in-process from fileglancer.user_worker import _ACTIONS, WorkerContext diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index db77c095..56802b86 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -320,9 +320,9 @@ def _action_open_file(request: dict, ctx: WorkerContext) -> dict: return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} return {"error": str(e), "status_code": 400} except FileNotFoundError: - return {"error": "File or directory not found", "status_code": 404} + return {"error": f"File or directory not found: {fsp_name}/{subpath}", "status_code": 404} except PermissionError: - return {"error": "Permission denied", "status_code": 403} + return {"error": f"Permission denied: {fsp_name}/{subpath}", "status_code": 403} def _action_head_file(request: dict, ctx: WorkerContext) -> dict: diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 3381f8f6..7c95a734 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -49,7 +49,9 @@ class WorkerError(Exception): """Raised when a worker returns an error response.""" - pass + def __init__(self, message: str, status_code: int = 500): + super().__init__(message) + self.status_code = status_code class WorkerDead(Exception): @@ -113,7 +115,10 @@ async def execute(self, action: str, **kwargs) -> Any: fh = response.pop("_file_handle", None) if fh is not None: fh.close() - raise WorkerError(response["error"]) + raise WorkerError( + response["error"], + status_code=response.get("status_code", 500), + ) return response except (BrokenPipeError, ConnectionResetError, OSError) as e: From b613c6db4c6e91fb8f5e8a6ac75f40d3411c58a9 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:21:07 -0400 Subject: [PATCH 09/39] enforce max_workers cap when all workers are busy When the pool is at capacity and all workers are busy, _evict_lru() returns without evicting. Previously, get_worker() would unconditionally spawn a new worker anyway, silently exceeding the limit. Now it raises a WorkerError(503) so the caller gets a proper "try again later" instead of unbounded subprocess growth. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/worker_pool.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 7c95a734..32564ce5 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -254,6 +254,10 @@ async def get_worker(self, username: str) -> UserWorker: if len(self._workers) >= self.max_workers: await self._evict_lru() + # If still at capacity (all workers busy), refuse rather than exceed the limit + if len(self._workers) >= self.max_workers: + raise WorkerError("Worker pool at capacity, try again later", status_code=503) + # Spawn new worker new_worker = await self._spawn_worker(username) self._workers[username] = new_worker From 5fc0bdf07fdf121721a83c5bc709dae35b81f3f4 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:21:31 -0400 Subject: [PATCH 10/39] use settings for worker pool max_workers and idle_timeout WorkerPool.__init__ was hardcoding max_workers=50 and idle_timeout=300 instead of reading from Settings.worker_pool_max_workers and worker_pool_idle_timeout. The eviction loop sleep is now also capped at the idle_timeout so shorter timeouts are honored promptly. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/worker_pool.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 32564ce5..ff607ebe 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -221,8 +221,8 @@ def __init__(self, settings: Settings): self._workers: dict[str, UserWorker] = {} self._locks: dict[str, asyncio.Lock] = {} self._eviction_task: Optional[asyncio.Task] = None - self.max_workers = 50 - self.idle_timeout = 300 # seconds + self.max_workers = settings.worker_pool_max_workers + self.idle_timeout = settings.worker_pool_idle_timeout def _get_lock(self, username: str) -> asyncio.Lock: if username not in self._locks: @@ -351,7 +351,7 @@ async def start_eviction_loop(self): async def _eviction_loop(self): """Periodically evict idle workers.""" while True: - await asyncio.sleep(60) + await asyncio.sleep(min(60, self.idle_timeout)) now = time.monotonic() to_evict = [] for name, worker in list(self._workers.items()): From 8841c9d928d030613598dd61d7da613ae2db6307 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:21:58 -0400 Subject: [PATCH 11/39] fix file descriptor leaks in _send_and_recv Two fd leak paths: (1) if an exception occurs mid-receive, any fds already collected from SCM_RIGHTS ancillary data were never closed; (2) if multiple fds arrived (unlikely but possible), only fds[0] was wrapped and the rest leaked. Now all fds are closed on error, and extra fds beyond the first are closed on success. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/worker_pool.py | 84 ++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index ff607ebe..1690f2a5 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -142,43 +142,57 @@ def _send_and_recv(self, request: dict) -> dict: # Receive header + payload + optional fd, all via recvmsg fds = array.array("i") raw = b"" - # First, read at least the header - while len(raw) < _HEADER_SIZE: - msg, ancdata, flags, addr = self.sock.recvmsg( - max(_HEADER_SIZE - len(raw), 4096), - socket.CMSG_LEN(struct.calcsize("i")), - ) - if not msg: - raise ConnectionError("Worker closed connection") - raw += msg - for cmsg_level, cmsg_type, cmsg_data in ancdata: - if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: - fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) - - (length,) = struct.unpack(_HEADER_FMT, raw[:_HEADER_SIZE]) - if length > _MAX_MESSAGE_SIZE: - raise WorkerError(f"Response too large: {length} bytes") - - # We may have read some payload bytes with the header - total_needed = _HEADER_SIZE + length - while len(raw) < total_needed: - msg, ancdata, flags, addr = self.sock.recvmsg( - total_needed - len(raw), - socket.CMSG_LEN(struct.calcsize("i")), - ) - if not msg: - raise ConnectionError("Worker closed connection mid-message") - raw += msg - for cmsg_level, cmsg_type, cmsg_data in ancdata: - if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: - fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) - - body = raw[_HEADER_SIZE:_HEADER_SIZE + length] - response = json.loads(body) - - # If an fd arrived, wrap it in a file object + try: + # First, read at least the header + while len(raw) < _HEADER_SIZE: + msg, ancdata, flags, addr = self.sock.recvmsg( + max(_HEADER_SIZE - len(raw), 4096), + socket.CMSG_LEN(struct.calcsize("i")), + ) + if not msg: + raise ConnectionError("Worker closed connection") + raw += msg + for cmsg_level, cmsg_type, cmsg_data in ancdata: + if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: + fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) + + (length,) = struct.unpack(_HEADER_FMT, raw[:_HEADER_SIZE]) + if length > _MAX_MESSAGE_SIZE: + raise WorkerError(f"Response too large: {length} bytes") + + # We may have read some payload bytes with the header + total_needed = _HEADER_SIZE + length + while len(raw) < total_needed: + msg, ancdata, flags, addr = self.sock.recvmsg( + total_needed - len(raw), + socket.CMSG_LEN(struct.calcsize("i")), + ) + if not msg: + raise ConnectionError("Worker closed connection mid-message") + raw += msg + for cmsg_level, cmsg_type, cmsg_data in ancdata: + if cmsg_level == socket.SOL_SOCKET and cmsg_type == socket.SCM_RIGHTS: + fds.frombytes(cmsg_data[:len(cmsg_data) - (len(cmsg_data) % fds.itemsize)]) + + body = raw[_HEADER_SIZE:_HEADER_SIZE + length] + response = json.loads(body) + except Exception: + # Close any fds received before the error to prevent leaks + for fd_val in fds: + try: + os.close(fd_val) + except OSError: + pass + raise + + # If an fd arrived, wrap it in a file object and close any extras if fds: response["_file_handle"] = os.fdopen(fds[0], "rb") + for extra_fd in fds[1:]: + try: + os.close(extra_fd) + except OSError: + pass return response From 18be1aee6415bbaf89a7c7ce7bc55bd74eb94f3a Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:22:19 -0400 Subject: [PATCH 12/39] add error handling to _action_get_file_info and _action_check_binary These handlers were missing try/except for FileNotFoundError and PermissionError, unlike all neighboring file action handlers. Exceptions would propagate to the worker main loop's generic handler and return 500 instead of proper 404/403. Also add status_code to _get_filestore error responses for consistency with other handlers. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/user_worker.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 56802b86..39ff7ae6 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -253,13 +253,18 @@ def _action_get_file_info(request: dict, ctx: WorkerContext) -> dict: filestore, error = _get_filestore(fsp_name, ctx.db_url) if filestore is None: - return {"error": error} + return {"error": error, "status_code": 404 if "not found" in error else 500} from fileglancer import database as db - with db.get_db_session(ctx.db_url) as session: - file_info = filestore.get_file_info(subpath, current_user=ctx.username, session=session) - return {"info": json.loads(file_info.model_dump_json())} + try: + with db.get_db_session(ctx.db_url) as session: + file_info = filestore.get_file_info(subpath, current_user=ctx.username, session=session) + return {"info": json.loads(file_info.model_dump_json())} + except FileNotFoundError: + return {"error": "File or directory not found", "status_code": 404} + except PermissionError: + return {"error": "Permission denied", "status_code": 403} def _action_check_binary(request: dict, ctx: WorkerContext) -> dict: @@ -269,10 +274,15 @@ def _action_check_binary(request: dict, ctx: WorkerContext) -> dict: filestore, error = _get_filestore(fsp_name, ctx.db_url) if filestore is None: - return {"error": error} + return {"error": error, "status_code": 404 if "not found" in error else 500} - is_binary = filestore.check_is_binary(subpath) - return {"is_binary": is_binary} + try: + is_binary = filestore.check_is_binary(subpath) + return {"is_binary": is_binary} + except FileNotFoundError: + return {"error": "File or directory not found", "status_code": 404} + except PermissionError: + return {"error": "Permission denied", "status_code": 403} def _action_open_file(request: dict, ctx: WorkerContext) -> dict: From aa73390f09fd7207fa47916a5b91b457f1c44cc6 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:22:49 -0400 Subject: [PATCH 13/39] use os.getgrouplist() for worker supplementary groups _spawn_worker() was using grp.getgrall() to build the child process's supplementary groups. On LDAP/NSS-backed systems, getgrall() may not enumerate all effective groups, causing workers to lose access to files that the user can normally reach. os.getgrouplist() queries NSS directly and matches the approach used by user_context.py. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/worker_pool.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 1690f2a5..7a66beb1 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -25,7 +25,6 @@ import array import asyncio -import grp import json import os import pwd @@ -284,9 +283,7 @@ async def _spawn_worker(self, username: str) -> UserWorker: # Build identity kwargs (only switch if running as root) identity_kwargs: dict = {} if os.geteuid() == 0: - groups = [g.gr_gid for g in grp.getgrall() if username in g.gr_mem] - if pw.pw_gid not in groups: - groups.append(pw.pw_gid) + groups = os.getgrouplist(username, pw.pw_gid) identity_kwargs = { "user": pw.pw_uid, "group": pw.pw_gid, From f351e7db5f0730d26cd0daf850396141430c1878 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:23:02 -0400 Subject: [PATCH 14/39] replace bare except with except Exception in head_object Bare except: catches SystemExit and KeyboardInterrupt, preventing clean shutdown. Use except Exception: instead. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fileglancer/server.py b/fileglancer/server.py index 13640c55..c3c948a3 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -1198,7 +1198,7 @@ async def head_object(sharing_key: str, path: str = ''): target_name=info["target_name"], path=subpath) return Response(headers=result.get("headers", {}), status_code=result.get("status_code", 200)) - except: + except Exception: logger.opt(exception=sys.exc_info()).info("Error requesting head") return get_error_response(500, "InternalError", "Error requesting HEAD", path) From 5d678f854b139ddb0dc9ea696e306449c046ecd5 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:23:30 -0400 Subject: [PATCH 15/39] use model_dump(mode='json') instead of json.loads(model_dump_json()) The json.loads(model.model_dump_json()) pattern serializes to a JSON string then immediately parses it back to a dict. model_dump(mode='json') produces the same dict directly without the string round-trip, which matters for directory listings with many files. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/user_worker.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 39ff7ae6..def72134 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -163,12 +163,12 @@ def _action_list_dir(request: dict, ctx: WorkerContext) -> dict: try: with db.get_db_session(ctx.db_url) as session: file_info = filestore.get_file_info(subpath, current_user=current_user, session=session) - result = {"info": json.loads(file_info.model_dump_json())} + result = {"info": file_info.model_dump(mode="json")} if file_info.is_dir: try: files = list(filestore.yield_file_infos(subpath, current_user=current_user, session=session)) - result["files"] = [json.loads(f.model_dump_json()) for f in files] + result["files"] = [f.model_dump(mode="json") for f in files] except PermissionError: result["files"] = [] result["error"] = "Permission denied when listing directory contents" @@ -211,7 +211,7 @@ def _action_list_dir_paged(request: dict, ctx: WorkerContext) -> dict: try: with db.get_db_session(ctx.db_url) as session: file_info = filestore.get_file_info(subpath, current_user=current_user, session=session) - result = {"info": json.loads(file_info.model_dump_json())} + result = {"info": file_info.model_dump(mode="json")} if file_info.is_dir: try: @@ -219,7 +219,7 @@ def _action_list_dir_paged(request: dict, ctx: WorkerContext) -> dict: subpath, current_user=current_user, session=session, limit=limit, cursor=cursor ) - result["files"] = [json.loads(f.model_dump_json()) for f in files] + result["files"] = [f.model_dump(mode="json") for f in files] result["has_more"] = has_more result["next_cursor"] = next_cursor result["total_count"] = total_count @@ -260,7 +260,7 @@ def _action_get_file_info(request: dict, ctx: WorkerContext) -> dict: try: with db.get_db_session(ctx.db_url) as session: file_info = filestore.get_file_info(subpath, current_user=ctx.username, session=session) - return {"info": json.loads(file_info.model_dump_json())} + return {"info": file_info.model_dump(mode="json")} except FileNotFoundError: return {"error": "File or directory not found", "status_code": 404} except PermissionError: @@ -354,7 +354,7 @@ def _action_head_file(request: dict, ctx: WorkerContext) -> dict: is_binary = filestore.check_is_binary(subpath) if not file_info.is_dir else False return { - "info": json.loads(file_info.model_dump_json()), + "info": file_info.model_dump(mode="json"), "content_type": content_type, "is_binary": is_binary, } @@ -477,7 +477,7 @@ def _action_update_file(request: dict, ctx: WorkerContext) -> dict: try: old_file_info = filestore.get_file_info(subpath, ctx.username) - result = {"info": json.loads(old_file_info.model_dump_json())} + result = {"info": old_file_info.model_dump(mode="json")} if new_permissions is not None and new_permissions != old_file_info.permissions: filestore.change_file_permissions(subpath, new_permissions) From 6b66a5a260a935147fc3a0f17298660904165d61 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:23:51 -0400 Subject: [PATCH 16/39] add error handling for dev-mode worker action dispatch In dev/test mode (no worker pool), exceptions from action handlers would propagate unhandled to the generic exception handler. In production, the worker subprocess catches these and returns error dicts. Now dev mode also catches and logs action handler exceptions, matching the production error behavior more closely. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/server.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fileglancer/server.py b/fileglancer/server.py index c3c948a3..d0a27500 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -257,7 +257,11 @@ async def _worker_exec(username: str, action: str, **kwargs): raise HTTPException(status_code=500, detail=f"Unknown action: {action}") ctx = WorkerContext(username=username, db_url=settings.db_url) request = {"action": action, **kwargs} - result = handler(request, ctx) + try: + result = handler(request, ctx) + except Exception as e: + logger.error(f"Action handler error for {username} action={action}: {e}") + raise HTTPException(status_code=500, detail=str(e)) # Strip the raw fd (not meaningful in-process), keep _file_handle result.pop("_fd", None) return result From 7776d6aa773357a2deb0dbfd4c18b012ea5ca658 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:24:13 -0400 Subject: [PATCH 17/39] use ctx.db_url consistently in job file action handlers _action_get_job_file_paths and _action_get_service_url were calling get_settings() to get the db_url while every other handler uses ctx.db_url. Use ctx.db_url for consistency with the rest of the worker action handlers. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/user_worker.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index def72134..d70e8295 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -602,12 +602,10 @@ def _action_get_job_file_paths(request: dict, ctx: WorkerContext) -> dict: """Get job file path info.""" from fileglancer.apps.core import get_job_file_paths from fileglancer import database as db - from fileglancer.settings import get_settings - settings = get_settings() job_id = request["job_id"] - with db.get_db_session(settings.db_url) as session: + with db.get_db_session(ctx.db_url) as session: db_job = db.get_job(session, job_id, ctx.username) if db_job is None: return {"error": f"Job {job_id} not found", "status_code": 404} @@ -619,12 +617,10 @@ def _action_get_service_url(request: dict, ctx: WorkerContext) -> dict: """Read service URL from job work directory.""" from fileglancer.apps.core import get_service_url from fileglancer import database as db - from fileglancer.settings import get_settings - settings = get_settings() job_id = request["job_id"] - with db.get_db_session(settings.db_url) as session: + with db.get_db_session(ctx.db_url) as session: db_job = db.get_job(session, job_id, ctx.username) if db_job is None: return {"error": f"Job {job_id} not found", "status_code": 404} From 626da3d3befe60b634d1e804385b8afa39574dfb Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sun, 12 Apr 2026 12:24:27 -0400 Subject: [PATCH 18/39] add 120s socket timeout to worker IPC The worker socket had no timeout, so if a worker hung mid-response the run_in_executor thread would block indefinitely. Add a 120s timeout so the socket raises and the request fails rather than silently tying up a thread forever. Co-Authored-By: Claude Opus 4.6 (1M context) --- fileglancer/worker_pool.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 7a66beb1..81a0cc6f 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -317,8 +317,10 @@ async def _spawn_worker(self, username: str) -> UserWorker: # Close child's end in the parent child_sock.close() - # Keep the socket blocking — all I/O runs in a thread via run_in_executor + # Keep the socket blocking — all I/O runs in a thread via run_in_executor. + # Set a timeout so a hung worker can't block a thread forever. parent_sock.setblocking(True) + parent_sock.settimeout(120) # Start a background task to forward worker stderr to loguru asyncio.create_task(self._forward_stderr(username, process)) From 7a7e407d0392b60dd709c01084d833662c755543 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Mon, 13 Apr 2026 09:00:02 -0400 Subject: [PATCH 19/39] remove dead code (EffectiveUserContext) --- fileglancer/apps/core.py | 2 +- fileglancer/server.py | 15 ---- fileglancer/user_context.py | 132 ------------------------------------ 3 files changed, 1 insertion(+), 148 deletions(-) delete mode 100644 fileglancer/user_context.py diff --git a/fileglancer/apps/core.py b/fileglancer/apps/core.py index d3a15045..31c2f2c2 100644 --- a/fileglancer/apps/core.py +++ b/fileglancer/apps/core.py @@ -140,7 +140,7 @@ async def _ensure_repo_cache(url: str, pull: bool = False, When username is provided, the work is delegated to a worker subprocess that runs with the target user's real UID/GID, avoiding the process-wide - euid race condition that EffectiveUserContext has with concurrent async + euid race condition that seteuid/setegid has with concurrent async requests. When username is None, git commands run in-process (used by the worker subprocess itself, or in single-user dev mode). """ diff --git a/fileglancer/server.py b/fileglancer/server.py index d0a27500..99a09a9a 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -35,7 +35,6 @@ from fileglancer.settings import get_settings from fileglancer.issues import create_jira_ticket, get_jira_ticket_details, delete_jira_ticket from fileglancer.utils import format_timestamp, guess_content_type, parse_range_header -from fileglancer.user_context import UserContext, EffectiveUserContext, CurrentUserContext, UserContextConfigurationError from fileglancer.filestore import Filestore, RootCheckError from fileglancer.log import AccessLogMiddleware from fileglancer.worker_pool import WorkerPool, WorkerError, WorkerDead @@ -218,12 +217,6 @@ def create_app(settings): # Per-user persistent worker pool (only used when use_access_flags=True) worker_pool = WorkerPool(settings) if settings.use_access_flags else None - def _get_user_context(username: str) -> UserContext: - if settings.use_access_flags: - return EffectiveUserContext(username) - else: - return CurrentUserContext() - async def _worker_exec(username: str, action: str, **kwargs): """Dispatch an action to the per-user worker and return the result. @@ -466,14 +459,6 @@ async def validation_exception_handler(request, exc): return JSONResponse({"error":str(exc)}, status_code=400) - @app.exception_handler(UserContextConfigurationError) - async def user_context_config_error_handler(request, exc): - logger.error(f"User context configuration error: {exc}") - return JSONResponse( - {"error": str(exc)}, - status_code=500 - ) - @app.exception_handler(PermissionError) async def permission_error_handler(request, exc): error_msg = str(exc) diff --git a/fileglancer/user_context.py b/fileglancer/user_context.py deleted file mode 100644 index 6c455553..00000000 --- a/fileglancer/user_context.py +++ /dev/null @@ -1,132 +0,0 @@ -import os -import pwd -from contextlib import AbstractContextManager - -from loguru import logger - -from fileglancer.settings import get_settings - - -class UserContextConfigurationError(PermissionError): - """ - Raised when user context setup fails due to configuration issues. - This happens when use_access_flags=true but the server is not running with sufficient privileges. - """ - def __init__(self, message: str = "Server configuration error: Run the server as root or set use_access_flags=false in config.yaml"): - super().__init__(message) - - -class UserContext(AbstractContextManager): - """ - Base no-op proxy context that does nothing. - """ - def __exit__(self, exc_type, exc_val, exc_tb): - return False - - -class CurrentUserContext(UserContext): - """ - A context manager the keeps the current user context. - """ - pass - - -class EffectiveUserContext(UserContext): - """ - A context manager for setting the user and group context for a process using seteuid/setegid access flags. - """ - def __init__(self, username: str): - self.username = username - self._uid = os.getuid() - self._gid = os.getgid() - self._gids = os.getgrouplist(pwd.getpwuid(self._uid).pw_name, self._gid) - self._user = None - - def __enter__(self): - logger.debug( - f"EffectiveUserContext entering for {self.username} " - f"(current euid={os.geteuid()} egid={os.getegid()})" - ) - user = pwd.getpwnam(self.username) - - uid = user.pw_uid - gid = user.pw_gid - gids = os.getgrouplist(self.username, gid) - try: - os.setegid(gid) - except PermissionError as e: - logger.error(f"Failed to set the effective gid: {e}") - settings = get_settings() - if settings.use_access_flags: - raise UserContextConfigurationError() from e - else: - raise - except Exception as e: - logger.error(f"Failed to set the effective gid: {e}") - raise e - - try: - # the maximum number of groups that could be set is os.sysconf("SC_NGROUPS_MAX") - # so if the current user has more than that an exception will be raised - # for now I don't limit this because I want to see if this will happen - if len(gids) > os.sysconf("SC_NGROUPS_MAX"): - logger.warning(( - f"User {self.username} is part of {len(gids)} groups " - f"which is greater than {os.sysconf("SC_NGROUPS_MAX")} " - "so this may result in an error" - )) - os.setgroups(gids) - except PermissionError as e: - logger.error(f"Failed to set the user groups: {e}") - # reset egid first - os.setegid(self._gid) - settings = get_settings() - if settings.use_access_flags: - raise UserContextConfigurationError() from e - else: - raise - except Exception as e: - logger.error(f"Failed to set the user groups: {e}") - # reset egid first - os.setegid(self._gid) - raise e - - try: - os.seteuid(uid) - except PermissionError as e: - logger.error(f"Failed to set euid: {e}") - # reset egid - os.setegid(self._gid) - settings = get_settings() - if settings.use_access_flags: - raise UserContextConfigurationError() from e - else: - raise - except Exception as e: - logger.error(f"Failed to set euid: {e}") - # reset egid - os.setegid(self._gid) - raise e - - self._user = user - logger.debug( - f"EffectiveUserContext now running as {self.username} " - f"(euid={os.geteuid()} egid={os.getegid()})" - ) - - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - logger.debug( - f"EffectiveUserContext exiting for {self.username} " - f"(restoring euid={self._uid} egid={self._gid})" - ) - os.seteuid(self._uid) - os.setegid(self._gid) - if len(self._gids) > os.sysconf("SC_NGROUPS_MAX"): - logger.info(f"Truncate original {len(self._gids)} groups to max allowed to set: {os.sysconf("SC_NGROUPS_MAX")}") - os.setgroups(self._gids[:os.sysconf("SC_NGROUPS_MAX")]) - else: - os.setgroups(self._gids) - self._user = None - return False From ff3d87efc0142d6dacbcb162a18e0d86af918333 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 17:41:31 -0400 Subject: [PATCH 20/39] linting --- frontend/src/components/ui/BrowsePage/FileBrowser.tsx | 6 +++--- frontend/src/components/ui/Dialogs/ChangePermissions.tsx | 3 ++- .../src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/frontend/src/components/ui/BrowsePage/FileBrowser.tsx b/frontend/src/components/ui/BrowsePage/FileBrowser.tsx index 7beea4b9..cab32248 100644 --- a/frontend/src/components/ui/BrowsePage/FileBrowser.tsx +++ b/frontend/src/components/ui/BrowsePage/FileBrowser.tsx @@ -128,9 +128,9 @@ export default function FileBrowser({ const propertiesTarget = fileBrowserState.propertiesTarget; const isFavorite = Boolean( fspName && - folderPreferenceMap[ - makeMapKey('folder', `${fspName}_${propertiesTarget.path}`) - ] + folderPreferenceMap[ + makeMapKey('folder', `${fspName}_${propertiesTarget.path}`) + ] ); return [ diff --git a/frontend/src/components/ui/Dialogs/ChangePermissions.tsx b/frontend/src/components/ui/Dialogs/ChangePermissions.tsx index 48fa2ea4..63573cb4 100644 --- a/frontend/src/components/ui/Dialogs/ChangePermissions.tsx +++ b/frontend/src/components/ui/Dialogs/ChangePermissions.tsx @@ -228,7 +228,8 @@ export default function ChangePermissions({ className="!rounded-md" disabled={Boolean( mutations.changePermissions.isPending || - localPermissions === fileBrowserState.propertiesTarget.permissions + localPermissions === + fileBrowserState.propertiesTarget.permissions )} type="submit" > diff --git a/frontend/src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx b/frontend/src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx index 6bf87496..b94e9bae 100644 --- a/frontend/src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx +++ b/frontend/src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx @@ -279,7 +279,7 @@ export default function PropertiesDrawer({ } disabled={Boolean( externalDataUrlQuery.data || - fileBrowserState.propertiesTarget.hasRead === false + fileBrowserState.propertiesTarget.hasRead === false )} id="share-switch" label={ From 63fa53a064b2a8a6929871096d24d45246544004 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 17:50:20 -0400 Subject: [PATCH 21/39] guard pwd import for Windows compatibility worker_pool and user_worker imported pwd at module top-level, breaking import on Windows where pwd doesn't exist. Wrap in try/except per the same pattern used for user_context.py on main. Also skip the entire test_worker.py module on Windows since the worker subsystem uses fork/setuid/SCM_RIGHTS. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 5 ++++- fileglancer/worker_pool.py | 5 ++++- tests/test_worker.py | 6 ++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index d70e8295..cd95e305 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -27,7 +27,10 @@ import json import logging import os -import pwd +try: + import pwd +except ImportError: + pwd = None # type: ignore[assignment] import socket import struct import sys diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 81a0cc6f..ea0e9ab1 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -27,7 +27,10 @@ import asyncio import json import os -import pwd +try: + import pwd +except ImportError: + pwd = None # type: ignore[assignment] import socket import struct import subprocess diff --git a/tests/test_worker.py b/tests/test_worker.py index 214e9ce2..3f451af0 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -10,11 +10,17 @@ import os import socket import struct +import sys import tempfile import time import pytest +pytestmark = pytest.mark.skipif( + sys.platform == "win32", + reason="Worker subsystem uses fork/setuid/SCM_RIGHTS (Unix-only)", +) + from fileglancer.user_worker import ( _send, _send_with_fd, From 2023f840f6ba59214d95d5bafbcfe2f1575876cc Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 17:58:25 -0400 Subject: [PATCH 22/39] fix Windows build: prettier 3.8.3 reformat + grp import guard - Reformat 3 frontend files with prettier 3.8.3 (the version pinned on main; my earlier run used cached 3.6.2 which had different rules) - Move 'import grp as _grp' inside the existing try/except in _action_get_profile so the get_profile endpoint returns an empty groups list on Windows instead of 500-ing on missing grp module Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 3 +-- frontend/src/components/ui/BrowsePage/FileBrowser.tsx | 6 +++--- frontend/src/components/ui/Dialogs/ChangePermissions.tsx | 3 +-- .../src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index cd95e305..c5d18ec1 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -513,8 +513,6 @@ def _action_validate_paths(request: dict, ctx: WorkerContext) -> dict: def _action_get_profile(request: dict, ctx: WorkerContext) -> dict: """Get user profile information.""" - import grp as _grp - from fileglancer import database as db username = ctx.username @@ -535,6 +533,7 @@ def _action_get_profile(request: dict, ctx: WorkerContext) -> dict: user_groups = [] try: + import grp as _grp user_info = pwd.getpwnam(username) all_groups = _grp.getgrall() for group in all_groups: diff --git a/frontend/src/components/ui/BrowsePage/FileBrowser.tsx b/frontend/src/components/ui/BrowsePage/FileBrowser.tsx index cab32248..7beea4b9 100644 --- a/frontend/src/components/ui/BrowsePage/FileBrowser.tsx +++ b/frontend/src/components/ui/BrowsePage/FileBrowser.tsx @@ -128,9 +128,9 @@ export default function FileBrowser({ const propertiesTarget = fileBrowserState.propertiesTarget; const isFavorite = Boolean( fspName && - folderPreferenceMap[ - makeMapKey('folder', `${fspName}_${propertiesTarget.path}`) - ] + folderPreferenceMap[ + makeMapKey('folder', `${fspName}_${propertiesTarget.path}`) + ] ); return [ diff --git a/frontend/src/components/ui/Dialogs/ChangePermissions.tsx b/frontend/src/components/ui/Dialogs/ChangePermissions.tsx index 63573cb4..48fa2ea4 100644 --- a/frontend/src/components/ui/Dialogs/ChangePermissions.tsx +++ b/frontend/src/components/ui/Dialogs/ChangePermissions.tsx @@ -228,8 +228,7 @@ export default function ChangePermissions({ className="!rounded-md" disabled={Boolean( mutations.changePermissions.isPending || - localPermissions === - fileBrowserState.propertiesTarget.permissions + localPermissions === fileBrowserState.propertiesTarget.permissions )} type="submit" > diff --git a/frontend/src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx b/frontend/src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx index b94e9bae..6bf87496 100644 --- a/frontend/src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx +++ b/frontend/src/components/ui/PropertiesDrawer/PropertiesDrawer.tsx @@ -279,7 +279,7 @@ export default function PropertiesDrawer({ } disabled={Boolean( externalDataUrlQuery.data || - fileBrowserState.propertiesTarget.hasRead === false + fileBrowserState.propertiesTarget.hasRead === false )} id="share-switch" label={ From 8d78fe293c00a8c40f66328d0800da8e735c0c75 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 18:24:28 -0400 Subject: [PATCH 23/39] cache verified Filestore instances per worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _get_filestore did a real filesystem stat on the mount root for every file action. Cache successful lookups for the worker's lifetime — mount status doesn't change mid-session, and workers are evicted on idle so a remount picks up next spawn. Only successes are cached; 'not found' / 'not mounted' still retry. Test fixture clears the cache between tests since each test creates a fresh fsp with the same name pointing at a different tempdir. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 12 ++++++++++++ tests/test_endpoints.py | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index c5d18ec1..047250e7 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -131,8 +131,19 @@ def _recvall(sock: socket.socket, n: int) -> Optional[bytes]: # Action handlers — file operations # --------------------------------------------------------------------------- +# Per-worker cache of verified Filestore instances. Once a mount has been +# successfully verified, we trust it for the lifetime of the worker process — +# workers are short-lived enough (idle eviction) that we don't need to handle +# unmount/remount mid-session. +_filestore_cache: dict[str, Any] = {} + + def _get_filestore(fsp_name: str, db_url: str): """Look up a FileSharePath and return a Filestore instance.""" + cached = _filestore_cache.get(fsp_name) + if cached is not None: + return cached, None + from fileglancer import database as db from fileglancer.filestore import Filestore @@ -147,6 +158,7 @@ def _get_filestore(fsp_name: str, db_url: str): except FileNotFoundError: return None, f"File share path '{fsp_name}' is not mounted" + _filestore_cache[fsp_name] = filestore return filestore, None diff --git a/tests/test_endpoints.py b/tests/test_endpoints.py index ae814ed7..fbdaeb1a 100644 --- a/tests/test_endpoints.py +++ b/tests/test_endpoints.py @@ -85,6 +85,11 @@ def test_app(temp_dir): from fileglancer.database import dispose_engine dispose_engine(db_url) + # Clear the per-process filestore cache so subsequent tests don't see + # stale Filestore instances pointing at this test's temp directory + from fileglancer.user_worker import _filestore_cache + _filestore_cache.clear() + # Restore original get_settings and clear cache fileglancer.settings.get_settings = original_get_settings fileglancer.database.get_settings = original_get_settings From e28b6e4b6d797c1c66e2c5de14b6984f0be4b042 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 18:29:09 -0400 Subject: [PATCH 24/39] add @with_filestore decorator for action handlers 13 file-action handlers had identical fsp resolution boilerplate. Extract into a decorator that resolves request['fsp_name'] and passes the Filestore as the third arg, returning a 404/500 error response if resolution fails. Side benefit: handlers that previously returned {'error': ...} without a status_code now consistently get 404 for missing fsps instead of defaulting to 500. Net -34 lines. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 120 +++++++++++++------------------------ 1 file changed, 43 insertions(+), 77 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 047250e7..2b5861a4 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -24,6 +24,7 @@ import asyncio import ctypes import ctypes.util +import functools import json import logging import os @@ -162,16 +163,28 @@ def _get_filestore(fsp_name: str, db_url: str): return filestore, None -def _action_list_dir(request: dict, ctx: WorkerContext) -> dict: +def with_filestore(fn): + """Resolve request["fsp_name"] to a Filestore and pass it as the third arg. + + Returns an error response if the filestore can't be resolved (404 for + missing fsp, 500 for unmounted) so the handler body never has to deal + with the not-found case. + """ + @functools.wraps(fn) + def wrapper(request: dict, ctx: WorkerContext) -> dict: + filestore, error = _get_filestore(request["fsp_name"], ctx.db_url) + if filestore is None: + return {"error": error, "status_code": 404 if "not found" in error else 500} + return fn(request, ctx, filestore) + return wrapper + + +@with_filestore +def _action_list_dir(request: dict, ctx: WorkerContext, filestore) -> dict: """List directory contents.""" - fsp_name = request["fsp_name"] subpath = request.get("subpath", "") current_user = ctx.username - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error, "status_code": 404 if "not found" in error else 500} - from fileglancer import database as db from fileglancer.filestore import RootCheckError @@ -208,18 +221,14 @@ def _action_list_dir(request: dict, ctx: WorkerContext) -> dict: return {"error": "Permission denied", "status_code": 403} -def _action_list_dir_paged(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_list_dir_paged(request: dict, ctx: WorkerContext, filestore) -> dict: """List directory contents with pagination.""" - fsp_name = request["fsp_name"] subpath = request.get("subpath", "") current_user = ctx.username limit = request.get("limit", 200) cursor = request.get("cursor") - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error, "status_code": 404 if "not found" in error else 500} - from fileglancer import database as db from fileglancer.filestore import RootCheckError @@ -261,15 +270,11 @@ def _action_list_dir_paged(request: dict, ctx: WorkerContext) -> dict: return {"error": "Permission denied", "status_code": 403} -def _action_get_file_info(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_get_file_info(request: dict, ctx: WorkerContext, filestore) -> dict: """Get metadata for a single file or directory.""" - fsp_name = request["fsp_name"] subpath = request.get("subpath", "") - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error, "status_code": 404 if "not found" in error else 500} - from fileglancer import database as db try: @@ -282,15 +287,11 @@ def _action_get_file_info(request: dict, ctx: WorkerContext) -> dict: return {"error": "Permission denied", "status_code": 403} -def _action_check_binary(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_check_binary(request: dict, ctx: WorkerContext, filestore) -> dict: """Check if a file is binary.""" - fsp_name = request["fsp_name"] subpath = request.get("subpath", "") - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error, "status_code": 404 if "not found" in error else 500} - try: is_binary = filestore.check_is_binary(subpath) return {"is_binary": is_binary} @@ -300,7 +301,8 @@ def _action_check_binary(request: dict, ctx: WorkerContext) -> dict: return {"error": "Permission denied", "status_code": 403} -def _action_open_file(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_open_file(request: dict, ctx: WorkerContext, filestore) -> dict: """Open a file and return its metadata + open file descriptor. The worker opens the file as the user and passes the fd back to the @@ -310,10 +312,6 @@ def _action_open_file(request: dict, ctx: WorkerContext) -> dict: fsp_name = request["fsp_name"] subpath = request.get("subpath", "") - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - from fileglancer.filestore import RootCheckError from fileglancer.utils import guess_content_type @@ -350,15 +348,11 @@ def _action_open_file(request: dict, ctx: WorkerContext) -> dict: return {"error": f"Permission denied: {fsp_name}/{subpath}", "status_code": 403} -def _action_head_file(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_head_file(request: dict, ctx: WorkerContext, filestore) -> dict: """Get file metadata and binary check for HEAD requests.""" - fsp_name = request["fsp_name"] subpath = request.get("subpath", "") - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - from fileglancer.filestore import RootCheckError from fileglancer.utils import guess_content_type @@ -387,15 +381,11 @@ def _action_head_file(request: dict, ctx: WorkerContext) -> dict: return {"error": "Permission denied", "status_code": 403} -def _action_create_dir(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_create_dir(request: dict, ctx: WorkerContext, filestore) -> dict: """Create a directory.""" - fsp_name = request["fsp_name"] subpath = request["subpath"] - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - try: filestore.create_dir(subpath) return {"ok": True} @@ -405,15 +395,11 @@ def _action_create_dir(request: dict, ctx: WorkerContext) -> dict: return {"error": str(e), "status_code": 403} -def _action_create_file(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_create_file(request: dict, ctx: WorkerContext, filestore) -> dict: """Create an empty file.""" - fsp_name = request["fsp_name"] subpath = request["subpath"] - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - try: filestore.create_empty_file(subpath) return {"ok": True} @@ -423,16 +409,12 @@ def _action_create_file(request: dict, ctx: WorkerContext) -> dict: return {"error": str(e), "status_code": 403} -def _action_rename(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_rename(request: dict, ctx: WorkerContext, filestore) -> dict: """Rename a file or directory.""" - fsp_name = request["fsp_name"] old_path = request["old_path"] new_path = request["new_path"] - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - try: filestore.rename_file_or_dir(old_path, new_path) return {"ok": True} @@ -442,15 +424,11 @@ def _action_rename(request: dict, ctx: WorkerContext) -> dict: return {"error": str(e), "status_code": 500} -def _action_delete(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_delete(request: dict, ctx: WorkerContext, filestore) -> dict: """Delete a file or directory.""" - fsp_name = request["fsp_name"] subpath = request["subpath"] - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - try: filestore.remove_file_or_dir(subpath) return {"ok": True} @@ -460,16 +438,12 @@ def _action_delete(request: dict, ctx: WorkerContext) -> dict: return {"error": str(e), "status_code": 500} -def _action_chmod(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_chmod(request: dict, ctx: WorkerContext, filestore) -> dict: """Change file permissions.""" - fsp_name = request["fsp_name"] subpath = request["subpath"] permissions = request["permissions"] - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - try: filestore.change_file_permissions(subpath, permissions) return {"ok": True} @@ -479,17 +453,13 @@ def _action_chmod(request: dict, ctx: WorkerContext) -> dict: return {"error": str(e), "status_code": 500} -def _action_update_file(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_update_file(request: dict, ctx: WorkerContext, filestore) -> dict: """Handle rename and/or permission change on a file.""" - fsp_name = request["fsp_name"] subpath = request.get("subpath", "") new_path = request.get("new_path") new_permissions = request.get("new_permissions") - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - try: old_file_info = filestore.get_file_info(subpath, ctx.username) result = {"info": old_file_info.model_dump(mode="json")} @@ -747,19 +717,15 @@ def _action_s3_open_object(request: dict, ctx: WorkerContext) -> dict: # Action handlers — proxied path validation # --------------------------------------------------------------------------- -def _action_validate_proxied_path(request: dict, ctx: WorkerContext) -> dict: +@with_filestore +def _action_validate_proxied_path(request: dict, ctx: WorkerContext, filestore) -> dict: """Validate that the user can access a proxied path. Runs within the user's context (the worker IS the user), so filesystem permission checks just work. """ - fsp_name = request["fsp_name"] path = request["path"] - filestore, error = _get_filestore(fsp_name, ctx.db_url) - if filestore is None: - return {"error": error} - try: filestore.get_file_info(path) return {"ok": True} From cdc34d894b6014e67e19c1a2ef467a10bf485ad1 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 18:33:52 -0400 Subject: [PATCH 25/39] add @action decorator for handler registration Replace the 47-line manual _ACTIONS registry at the bottom with @action(name) decorators co-located with each handler. New handlers can't silently miss the registry, and the action name is visible alongside the function definition. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 90 +++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 2b5861a4..a316be31 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -128,6 +128,22 @@ def _recvall(sock: socket.socket, n: int) -> Optional[bytes]: return bytes(data) +# --------------------------------------------------------------------------- +# Action registry +# --------------------------------------------------------------------------- + +# Populated by @action(name) decorators on each handler. +_ACTIONS: dict[str, Any] = {} + + +def action(name: str): + """Register a handler under the given action name.""" + def decorator(fn): + _ACTIONS[name] = fn + return fn + return decorator + + # --------------------------------------------------------------------------- # Action handlers — file operations # --------------------------------------------------------------------------- @@ -179,6 +195,7 @@ def wrapper(request: dict, ctx: WorkerContext) -> dict: return wrapper +@action("list_dir") @with_filestore def _action_list_dir(request: dict, ctx: WorkerContext, filestore) -> dict: """List directory contents.""" @@ -221,6 +238,7 @@ def _action_list_dir(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": "Permission denied", "status_code": 403} +@action("list_dir_paged") @with_filestore def _action_list_dir_paged(request: dict, ctx: WorkerContext, filestore) -> dict: """List directory contents with pagination.""" @@ -270,6 +288,7 @@ def _action_list_dir_paged(request: dict, ctx: WorkerContext, filestore) -> dict return {"error": "Permission denied", "status_code": 403} +@action("get_file_info") @with_filestore def _action_get_file_info(request: dict, ctx: WorkerContext, filestore) -> dict: """Get metadata for a single file or directory.""" @@ -287,6 +306,7 @@ def _action_get_file_info(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": "Permission denied", "status_code": 403} +@action("check_binary") @with_filestore def _action_check_binary(request: dict, ctx: WorkerContext, filestore) -> dict: """Check if a file is binary.""" @@ -301,6 +321,7 @@ def _action_check_binary(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": "Permission denied", "status_code": 403} +@action("open_file") @with_filestore def _action_open_file(request: dict, ctx: WorkerContext, filestore) -> dict: """Open a file and return its metadata + open file descriptor. @@ -348,6 +369,7 @@ def _action_open_file(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": f"Permission denied: {fsp_name}/{subpath}", "status_code": 403} +@action("head_file") @with_filestore def _action_head_file(request: dict, ctx: WorkerContext, filestore) -> dict: """Get file metadata and binary check for HEAD requests.""" @@ -381,6 +403,7 @@ def _action_head_file(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": "Permission denied", "status_code": 403} +@action("create_dir") @with_filestore def _action_create_dir(request: dict, ctx: WorkerContext, filestore) -> dict: """Create a directory.""" @@ -395,6 +418,7 @@ def _action_create_dir(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": str(e), "status_code": 403} +@action("create_file") @with_filestore def _action_create_file(request: dict, ctx: WorkerContext, filestore) -> dict: """Create an empty file.""" @@ -409,6 +433,7 @@ def _action_create_file(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": str(e), "status_code": 403} +@action("rename") @with_filestore def _action_rename(request: dict, ctx: WorkerContext, filestore) -> dict: """Rename a file or directory.""" @@ -424,6 +449,7 @@ def _action_rename(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": str(e), "status_code": 500} +@action("delete") @with_filestore def _action_delete(request: dict, ctx: WorkerContext, filestore) -> dict: """Delete a file or directory.""" @@ -438,6 +464,7 @@ def _action_delete(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": str(e), "status_code": 500} +@action("chmod") @with_filestore def _action_chmod(request: dict, ctx: WorkerContext, filestore) -> dict: """Change file permissions.""" @@ -453,6 +480,7 @@ def _action_chmod(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": str(e), "status_code": 500} +@action("update_file") @with_filestore def _action_update_file(request: dict, ctx: WorkerContext, filestore) -> dict: """Handle rename and/or permission change on a file.""" @@ -478,6 +506,7 @@ def _action_update_file(request: dict, ctx: WorkerContext, filestore) -> dict: return {"error": str(e), "status_code": 500} +@action("validate_paths") def _action_validate_paths(request: dict, ctx: WorkerContext) -> dict: """Validate file/directory paths for app parameters.""" from fileglancer.apps.core import validate_path_in_filestore @@ -493,6 +522,7 @@ def _action_validate_paths(request: dict, ctx: WorkerContext) -> dict: return {"errors": errors} +@action("get_profile") def _action_get_profile(request: dict, ctx: WorkerContext) -> dict: """Get user profile information.""" from fileglancer import database as db @@ -539,6 +569,7 @@ def _action_get_profile(request: dict, ctx: WorkerContext) -> dict: # Action handlers — SSH keys # --------------------------------------------------------------------------- +@action("list_ssh_keys") def _action_list_ssh_keys(request: dict, ctx: WorkerContext) -> dict: """List SSH keys.""" from fileglancer import sshkeys @@ -550,6 +581,7 @@ def _action_list_ssh_keys(request: dict, ctx: WorkerContext) -> dict: return {"error": str(e), "status_code": 500} +@action("generate_ssh_key") def _action_generate_ssh_key(request: dict, ctx: WorkerContext) -> dict: """Generate a temporary SSH key and authorize it.""" from fileglancer import sshkeys @@ -571,6 +603,7 @@ def _action_generate_ssh_key(request: dict, ctx: WorkerContext) -> dict: # Action handlers — job files # --------------------------------------------------------------------------- +@action("get_job_file") def _action_get_job_file(request: dict, ctx: WorkerContext) -> dict: """Read job file content (script, stdout, stderr).""" from fileglancer.apps.core import get_job_file_content @@ -582,6 +615,7 @@ def _action_get_job_file(request: dict, ctx: WorkerContext) -> dict: return {"content": content} +@action("get_job_file_paths") def _action_get_job_file_paths(request: dict, ctx: WorkerContext) -> dict: """Get job file path info.""" from fileglancer.apps.core import get_job_file_paths @@ -597,6 +631,7 @@ def _action_get_job_file_paths(request: dict, ctx: WorkerContext) -> dict: return {"files": files} +@action("get_service_url") def _action_get_service_url(request: dict, ctx: WorkerContext) -> dict: """Read service URL from job work directory.""" from fileglancer.apps.core import get_service_url @@ -616,6 +651,7 @@ def _action_get_service_url(request: dict, ctx: WorkerContext) -> dict: # Action handlers — S3 proxy # --------------------------------------------------------------------------- +@action("s3_list_objects") def _action_s3_list_objects(request: dict, ctx: WorkerContext) -> dict: """S3-compatible list objects.""" from x2s3.client_file import FileProxyClient @@ -644,6 +680,7 @@ def _action_s3_list_objects(request: dict, ctx: WorkerContext) -> dict: return {"body": result.body.decode(), "media_type": result.media_type, "status_code": result.status_code} +@action("s3_head_object") def _action_s3_head_object(request: dict, ctx: WorkerContext) -> dict: """S3-compatible head object.""" from x2s3.client_file import FileProxyClient @@ -662,6 +699,7 @@ def _action_s3_head_object(request: dict, ctx: WorkerContext) -> dict: return {"headers": headers, "status_code": result.status_code} +@action("s3_open_object") def _action_s3_open_object(request: dict, ctx: WorkerContext) -> dict: """S3-compatible open object — open the file and pass the fd back. @@ -717,6 +755,7 @@ def _action_s3_open_object(request: dict, ctx: WorkerContext) -> dict: # Action handlers — proxied path validation # --------------------------------------------------------------------------- +@action("validate_proxied_path") @with_filestore def _action_validate_proxied_path(request: dict, ctx: WorkerContext, filestore) -> dict: """Validate that the user can access a proxied path. @@ -737,6 +776,7 @@ def _action_validate_proxied_path(request: dict, ctx: WorkerContext, filestore) # Action handlers — cluster operations (absorbed from apps/worker.py) # --------------------------------------------------------------------------- +@action("submit") def _action_submit(request: dict, ctx: WorkerContext) -> dict: """Create work dir, symlink repo, submit job via py-cluster-api.""" from cluster_api import create_executor, ResourceSpec @@ -778,6 +818,7 @@ def _action_submit(request: dict, ctx: WorkerContext) -> dict: return {"job_id": job.job_id, "script_path": job.script_path} +@action("cancel") def _action_cancel(request: dict, ctx: WorkerContext) -> dict: """Cancel a cluster job via py-cluster-api.""" from cluster_api import create_executor @@ -790,6 +831,7 @@ def _action_cancel(request: dict, ctx: WorkerContext) -> dict: return {"status": "ok"} +@action("poll") def _action_poll(request: dict, ctx: WorkerContext) -> dict: """Poll job statuses via py-cluster-api.""" from cluster_api import create_executor @@ -829,6 +871,7 @@ def _action_poll(request: dict, ctx: WorkerContext) -> dict: return {"jobs": jobs} +@action("reconnect") def _action_reconnect(request: dict, ctx: WorkerContext) -> dict: """Reconnect to existing jobs via py-cluster-api.""" from cluster_api import create_executor @@ -857,6 +900,7 @@ def _action_reconnect(request: dict, ctx: WorkerContext) -> dict: # Action handlers — git/manifest operations (absorbed from apps/worker.py) # --------------------------------------------------------------------------- +@action("ensure_repo") def _action_ensure_repo(request: dict, ctx: WorkerContext) -> dict: """Clone or update a GitHub repo in the current user's cache.""" from fileglancer.apps.core import _ensure_repo_cache @@ -867,6 +911,7 @@ def _action_ensure_repo(request: dict, ctx: WorkerContext) -> dict: return {"repo_dir": str(repo_dir)} +@action("discover_manifests") def _action_discover_manifests(request: dict, ctx: WorkerContext) -> dict: """Clone/pull repo and discover all manifests.""" from fileglancer.apps.core import _ensure_repo_cache, _find_manifests_in_repo @@ -883,6 +928,7 @@ def _action_discover_manifests(request: dict, ctx: WorkerContext) -> dict: } +@action("read_manifest") def _action_read_manifest(request: dict, ctx: WorkerContext) -> dict: """Fetch and read a single manifest from a cached repo.""" from fileglancer.apps.core import _ensure_repo_cache, _read_manifest_file @@ -896,50 +942,6 @@ def _action_read_manifest(request: dict, ctx: WorkerContext) -> dict: return {"manifest": manifest.model_dump(mode="json")} -# --------------------------------------------------------------------------- -# Action registry -# --------------------------------------------------------------------------- - -_ACTIONS: dict[str, Any] = { - # File operations - "list_dir": _action_list_dir, - "list_dir_paged": _action_list_dir_paged, - "get_file_info": _action_get_file_info, - "check_binary": _action_check_binary, - "open_file": _action_open_file, - "head_file": _action_head_file, - "create_dir": _action_create_dir, - "create_file": _action_create_file, - "rename": _action_rename, - "delete": _action_delete, - "chmod": _action_chmod, - "update_file": _action_update_file, - "validate_paths": _action_validate_paths, - "validate_proxied_path": _action_validate_proxied_path, - "get_profile": _action_get_profile, - # SSH keys - "list_ssh_keys": _action_list_ssh_keys, - "generate_ssh_key": _action_generate_ssh_key, - # Job files - "get_job_file": _action_get_job_file, - "get_job_file_paths": _action_get_job_file_paths, - "get_service_url": _action_get_service_url, - # S3 proxy - "s3_list_objects": _action_s3_list_objects, - "s3_head_object": _action_s3_head_object, - "s3_open_object": _action_s3_open_object, - # Cluster operations - "submit": _action_submit, - "cancel": _action_cancel, - "poll": _action_poll, - "reconnect": _action_reconnect, - # Git/manifest operations - "ensure_repo": _action_ensure_repo, - "discover_manifests": _action_discover_manifests, - "read_manifest": _action_read_manifest, -} - - # --------------------------------------------------------------------------- # Worker context and main loop # --------------------------------------------------------------------------- From e1eb75bdc2527607425ffc2c1dfe2117412d63d2 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 18:34:49 -0400 Subject: [PATCH 26/39] extract _get_executor helper for cluster handlers The 4 cluster action handlers (submit, cancel, poll, reconnect) each repeated the same boilerplate: import create_executor, pop extra_args from cluster_config, call create_executor(**config). Pull it into one helper. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index a316be31..420b8567 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -776,15 +776,21 @@ def _action_validate_proxied_path(request: dict, ctx: WorkerContext, filestore) # Action handlers — cluster operations (absorbed from apps/worker.py) # --------------------------------------------------------------------------- -@action("submit") -def _action_submit(request: dict, ctx: WorkerContext) -> dict: - """Create work dir, symlink repo, submit job via py-cluster-api.""" - from cluster_api import create_executor, ResourceSpec +def _get_executor(request: dict): + """Build a py-cluster-api executor from request['cluster_config'].""" + from cluster_api import create_executor config = request["cluster_config"] config.pop("extra_args", None) + return create_executor(**config) + + +@action("submit") +def _action_submit(request: dict, ctx: WorkerContext) -> dict: + """Create work dir, symlink repo, submit job via py-cluster-api.""" + from cluster_api import ResourceSpec - executor = create_executor(**config) + executor = _get_executor(request) work_dir = Path(request["work_dir"]) work_dir.mkdir(parents=True, exist_ok=True) @@ -821,12 +827,7 @@ def _action_submit(request: dict, ctx: WorkerContext) -> dict: @action("cancel") def _action_cancel(request: dict, ctx: WorkerContext) -> dict: """Cancel a cluster job via py-cluster-api.""" - from cluster_api import create_executor - - config = request["cluster_config"] - config.pop("extra_args", None) - - executor = create_executor(**config) + executor = _get_executor(request) _run_async(executor.cancel(request["job_id"])) return {"status": "ok"} @@ -834,13 +835,9 @@ def _action_cancel(request: dict, ctx: WorkerContext) -> dict: @action("poll") def _action_poll(request: dict, ctx: WorkerContext) -> dict: """Poll job statuses via py-cluster-api.""" - from cluster_api import create_executor from cluster_api._types import JobRecord, JobStatus - config = request["cluster_config"] - config.pop("extra_args", None) - - executor = create_executor(**config) + executor = _get_executor(request) known_statuses = request.get("job_statuses", {}) for cid in request["cluster_job_ids"]: @@ -874,12 +871,7 @@ def _action_poll(request: dict, ctx: WorkerContext) -> dict: @action("reconnect") def _action_reconnect(request: dict, ctx: WorkerContext) -> dict: """Reconnect to existing jobs via py-cluster-api.""" - from cluster_api import create_executor - - config = request["cluster_config"] - config.pop("extra_args", None) - - executor = create_executor(**config) + executor = _get_executor(request) reconnected = _run_async(executor.reconnect()) jobs = {} From 01f225bc4815e4bd4e472fbd90dcb806e21703dd Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 19:02:45 -0400 Subject: [PATCH 27/39] fix validate_proxied_path returning 500 instead of 400 in worker mode Without an explicit status_code, worker.execute() raised WorkerError with the default 500, bypassing the caller's intended 400 handling. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 420b8567..ef21e63b 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -769,7 +769,7 @@ def _action_validate_proxied_path(request: dict, ctx: WorkerContext, filestore) filestore.get_file_info(path) return {"ok": True} except (FileNotFoundError, PermissionError) as e: - return {"error": str(e)} + return {"error": str(e), "status_code": 400} # --------------------------------------------------------------------------- From 57042b268d66b187c15008b74a19b36f51005b6d Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 19:03:38 -0400 Subject: [PATCH 28/39] return structured error from _get_filestore Replaces the substring match in with_filestore with explicit status codes returned alongside the error message. Unmounted file shares now return 503 instead of 500. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index ef21e63b..eb11e8eb 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -156,7 +156,11 @@ def decorator(fn): def _get_filestore(fsp_name: str, db_url: str): - """Look up a FileSharePath and return a Filestore instance.""" + """Look up a FileSharePath and return a Filestore instance. + + Returns (filestore, None) on success, or (None, error_response) on failure + where error_response is a dict ready to be returned from a handler. + """ cached = _filestore_cache.get(fsp_name) if cached is not None: return cached, None @@ -167,13 +171,19 @@ def _get_filestore(fsp_name: str, db_url: str): with db.get_db_session(db_url) as session: fsp = db.get_file_share_path(session, fsp_name) if fsp is None: - return None, f"File share path '{fsp_name}' not found" + return None, { + "error": f"File share path '{fsp_name}' not found", + "status_code": 404, + } filestore = Filestore(fsp) try: filestore.get_file_info(None) except FileNotFoundError: - return None, f"File share path '{fsp_name}' is not mounted" + return None, { + "error": f"File share path '{fsp_name}' is not mounted", + "status_code": 503, + } _filestore_cache[fsp_name] = filestore return filestore, None @@ -182,15 +192,15 @@ def _get_filestore(fsp_name: str, db_url: str): def with_filestore(fn): """Resolve request["fsp_name"] to a Filestore and pass it as the third arg. - Returns an error response if the filestore can't be resolved (404 for - missing fsp, 500 for unmounted) so the handler body never has to deal - with the not-found case. + Returns an error response if the filestore can't be resolved (404 for a + missing fsp, 503 for an unmounted one) so the handler body never has to + deal with the not-found case. """ @functools.wraps(fn) def wrapper(request: dict, ctx: WorkerContext) -> dict: - filestore, error = _get_filestore(request["fsp_name"], ctx.db_url) + filestore, error_response = _get_filestore(request["fsp_name"], ctx.db_url) if filestore is None: - return {"error": error, "status_code": 404 if "not found" in error else 500} + return error_response return fn(request, ctx, filestore) return wrapper From e609c755ccdaa6b40de131e78bba394bb94e5520 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 19:04:46 -0400 Subject: [PATCH 29/39] stop mutating request dict in _get_executor Building a filtered copy avoids surprising callers that read cluster_config again after the executor is built. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index eb11e8eb..879339ab 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -790,8 +790,7 @@ def _get_executor(request: dict): """Build a py-cluster-api executor from request['cluster_config'].""" from cluster_api import create_executor - config = request["cluster_config"] - config.pop("extra_args", None) + config = {k: v for k, v in request["cluster_config"].items() if k != "extra_args"} return create_executor(**config) From 62001fcddb42df855163404a7f73f00a4a5cc728 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 19:08:47 -0400 Subject: [PATCH 30/39] use Executor.track() in poll handler instead of _jobs Switches from reaching into the executor's private _jobs dict to the public track() method added in py-cluster-api. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 879339ab..8d768c3b 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -844,7 +844,7 @@ def _action_cancel(request: dict, ctx: WorkerContext) -> dict: @action("poll") def _action_poll(request: dict, ctx: WorkerContext) -> dict: """Poll job statuses via py-cluster-api.""" - from cluster_api._types import JobRecord, JobStatus + from cluster_api import JobStatus executor = _get_executor(request) @@ -855,12 +855,7 @@ def _action_poll(request: dict, ctx: WorkerContext) -> dict: seed_status = JobStatus(db_status) except ValueError: seed_status = JobStatus.PENDING - executor._jobs[cid] = JobRecord( - job_id=cid, - name="", - command="", - status=seed_status, - ) + executor.track(cid, status=seed_status) _run_async(executor.poll()) From d1bd656ce0532527dabd60890d38bcb9cd0e4e7a Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 19:12:28 -0400 Subject: [PATCH 31/39] bump py-cluster-api to >=0.6.0 Pulls in the public Executor.track() method used by the worker poll handler. Co-Authored-By: Claude Opus 4.7 (1M context) --- pixi.lock | 52 +++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/pixi.lock b/pixi.lock index 6e770f6f..897021bd 100644 --- a/pixi.lock +++ b/pixi.lock @@ -161,7 +161,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/fe/3b/8ec5074bcfc450fe84273713b4b0a0dd47c0249358f5d82eb8104ffe2520/multidict-6.7.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/b2/f2/889ad4b2408f72fe1a4f6a19491177b30ea7bf1a0fd5f17050ca08cfc882/propcache-0.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/44/d6/c8b4f53f34e295e45709b7568bf9b9407a612ea30387d35eb9fa84f269b4/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -314,7 +314,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a7/f9/44d4b3064c65079d2467888794dea218d1601898ac50222ab8a9a8094460/multidict-6.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/86/bd/47816020d337f4a746edc42fe8d53669965138f39ee117414c7d7a340cfe/propcache-0.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/13/1e/98874ce72fd29cbde93209977b196a2edae03f8490d1bd8158e7f1daf3a0/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -468,7 +468,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/d5/22/492f2246bb5b534abd44804292e81eeaf835388901f0c574bac4eeec73c5/multidict-6.7.1-cp314-cp314-macosx_10_15_x86_64.whl - pypi: https://files.pythonhosted.org/packages/65/9b/03b04e7d82a5f54fb16113d839f5ea1ede58a61e90edf515f6577c66fa8f/propcache-0.4.1-cp314-cp314-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/64/12/93ef0098590cf51d9732b4f139533732565704f45bdc1ffa741b7c95fb54/psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -622,7 +622,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/f1/4f/733c48f270565d78b4544f2baddc2fb2a245e5a8640254b12c36ac7ac68e/multidict-6.7.1-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/b2/fa/89a8ef0468d5833a23fff277b143d0573897cf75bd56670a6d28126c7d68/propcache-0.4.1-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/7c/a9/9d55c614a891288f15ca4b5209b09f0f01e3124056924e17b81b9fa054cc/psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -775,7 +775,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/e0/bf/52f25716bbe93745595800f36fb17b73711f14da59ed0bb2eba141bc9f0f/multidict-6.7.1-cp314-cp314-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/0c/2a/a758b47de253636e1b8aef181c0b4f4f204bf0dd964914fb2af90a95b49b/propcache-0.4.1-cp314-cp314-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/e1/36/9c0c326fe3a4227953dfb29f5d0c8ae3b8eb8c1cd2967aa569f50cb3c61f/psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -986,7 +986,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/fe/3b/8ec5074bcfc450fe84273713b4b0a0dd47c0249358f5d82eb8104ffe2520/multidict-6.7.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/b2/f2/889ad4b2408f72fe1a4f6a19491177b30ea7bf1a0fd5f17050ca08cfc882/propcache-0.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/44/d6/c8b4f53f34e295e45709b7568bf9b9407a612ea30387d35eb9fa84f269b4/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -1183,7 +1183,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a7/f9/44d4b3064c65079d2467888794dea218d1601898ac50222ab8a9a8094460/multidict-6.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/86/bd/47816020d337f4a746edc42fe8d53669965138f39ee117414c7d7a340cfe/propcache-0.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/13/1e/98874ce72fd29cbde93209977b196a2edae03f8490d1bd8158e7f1daf3a0/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -1376,7 +1376,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/d5/22/492f2246bb5b534abd44804292e81eeaf835388901f0c574bac4eeec73c5/multidict-6.7.1-cp314-cp314-macosx_10_15_x86_64.whl - pypi: https://files.pythonhosted.org/packages/65/9b/03b04e7d82a5f54fb16113d839f5ea1ede58a61e90edf515f6577c66fa8f/propcache-0.4.1-cp314-cp314-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/64/12/93ef0098590cf51d9732b4f139533732565704f45bdc1ffa741b7c95fb54/psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -1569,7 +1569,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/f1/4f/733c48f270565d78b4544f2baddc2fb2a245e5a8640254b12c36ac7ac68e/multidict-6.7.1-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/b2/fa/89a8ef0468d5833a23fff277b143d0573897cf75bd56670a6d28126c7d68/propcache-0.4.1-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/7c/a9/9d55c614a891288f15ca4b5209b09f0f01e3124056924e17b81b9fa054cc/psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -1762,7 +1762,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/e0/bf/52f25716bbe93745595800f36fb17b73711f14da59ed0bb2eba141bc9f0f/multidict-6.7.1-cp314-cp314-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/0c/2a/a758b47de253636e1b8aef181c0b4f4f204bf0dd964914fb2af90a95b49b/propcache-0.4.1-cp314-cp314-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/e1/36/9c0c326fe3a4227953dfb29f5d0c8ae3b8eb8c1cd2967aa569f50cb3c61f/psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/c6/01/c330682e398978b2e151d9a6684826874e09b9319d565b6b6b73986093e5/x2s3-1.2.0-py3-none-any.whl @@ -2011,7 +2011,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/fe/3b/8ec5074bcfc450fe84273713b4b0a0dd47c0249358f5d82eb8104ffe2520/multidict-6.7.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/b2/f2/889ad4b2408f72fe1a4f6a19491177b30ea7bf1a0fd5f17050ca08cfc882/propcache-0.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/44/d6/c8b4f53f34e295e45709b7568bf9b9407a612ea30387d35eb9fa84f269b4/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -2252,7 +2252,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a7/f9/44d4b3064c65079d2467888794dea218d1601898ac50222ab8a9a8094460/multidict-6.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/86/bd/47816020d337f4a746edc42fe8d53669965138f39ee117414c7d7a340cfe/propcache-0.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/13/1e/98874ce72fd29cbde93209977b196a2edae03f8490d1bd8158e7f1daf3a0/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -2494,7 +2494,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/d5/22/492f2246bb5b534abd44804292e81eeaf835388901f0c574bac4eeec73c5/multidict-6.7.1-cp314-cp314-macosx_10_15_x86_64.whl - pypi: https://files.pythonhosted.org/packages/65/9b/03b04e7d82a5f54fb16113d839f5ea1ede58a61e90edf515f6577c66fa8f/propcache-0.4.1-cp314-cp314-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/64/12/93ef0098590cf51d9732b4f139533732565704f45bdc1ffa741b7c95fb54/psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -2736,7 +2736,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/f1/4f/733c48f270565d78b4544f2baddc2fb2a245e5a8640254b12c36ac7ac68e/multidict-6.7.1-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/b2/fa/89a8ef0468d5833a23fff277b143d0573897cf75bd56670a6d28126c7d68/propcache-0.4.1-cp314-cp314-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/7c/a9/9d55c614a891288f15ca4b5209b09f0f01e3124056924e17b81b9fa054cc/psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -2975,7 +2975,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/e0/bf/52f25716bbe93745595800f36fb17b73711f14da59ed0bb2eba141bc9f0f/multidict-6.7.1-cp314-cp314-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/0c/2a/a758b47de253636e1b8aef181c0b4f4f204bf0dd964914fb2af90a95b49b/propcache-0.4.1-cp314-cp314-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/e1/36/9c0c326fe3a4227953dfb29f5d0c8ae3b8eb8c1cd2967aa569f50cb3c61f/psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -3229,7 +3229,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/46/4b/3aae6835b8e5f44ea6a68348ad90f78134047b503765087be2f9912140ea/propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - pypi: https://files.pythonhosted.org/packages/30/da/4e42788fb811bbbfd7b7f045570c062f49e350e1d1f3df056c3fb5763353/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -3475,7 +3475,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/9e/d3/6c7ee328b39a81ee877c962469f1e795f9db87f925251efeb0545e0020d0/propcache-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl - pypi: https://files.pythonhosted.org/packages/2d/75/364847b879eb630b3ac8293798e380e441a957c53657995053c5ec39a316/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -3717,7 +3717,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl - pypi: https://files.pythonhosted.org/packages/d8/91/f870a02f51be4a65987b45a7de4c2e1897dd0d01051e2b559a38fa634e3e/psycopg2_binary-2.9.11-cp312-cp312-macosx_10_13_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -3959,7 +3959,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/27/fa/cae40e06849b6c9a95eb5c04d419942f00d9eaac8d81626107461e268821/psycopg2_binary-2.9.11-cp312-cp312-macosx_11_0_arm64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -4199,7 +4199,7 @@ environments: - pypi: https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/54/09/d19cff2a5aaac632ec8fc03737b223597b1e347416934c1b3a7df079784c/propcache-0.4.1-cp312-cp312-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/b1/d2/99b55e85832ccde77b211738ff3925a5d73ad183c0b37bcbbe5a8ff04978/psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/48/a2/87a0b61a3078be07ab04ec574ef6c683c764590ed0d2a50d00cbb23aeae7/pydantic_settings_yaml-0.2.0-py2.py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/84/47/07046e0acedc12fe2bae79cf6c73ad67f51ae9d67df64d06b0f3eac73d36/pytest_html-4.2.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/3e/43/7e7b2ec865caa92f67b8f0e9231a798d102724ca4c0e1f414316be1c1ef2/pytest_metadata-3.1.1-py3-none-any.whl @@ -6313,8 +6313,8 @@ packages: timestamp: 1760972937564 - pypi: ./ name: fileglancer - version: 2.7.0 - sha256: c8b222d4b8a585dfbb7ad278203f4adaa89479d1e7374021c6ea932a97fa0e7b + version: 2.8.1 + sha256: a41eaa22ff769b80dae43fb9f159a7ed90ac0c1839d67154056a8cabc1de6067 requires_dist: - alembic>=1.17.0 - atlassian-python-api>=4.0.7 @@ -6329,7 +6329,7 @@ packages: - packaging>=24.0 - pandas>=2.3.3 - psycopg2-binary>=2.9.10,<3 - - py-cluster-api>=0.5.0 + - py-cluster-api>=0.6.0 - pydantic-settings>=2.11.0 - pydantic>=2.10.6 - python-jose>=3.5.0,<4 @@ -11431,10 +11431,10 @@ packages: - pkg:pypi/pure-eval?source=hash-mapping size: 16668 timestamp: 1733569518868 -- pypi: https://files.pythonhosted.org/packages/b8/ff/386c969ebe70942b23136c469db51980ac770ebccd2316803b2a018d99db/py_cluster_api-0.5.0-py3-none-any.whl +- pypi: https://files.pythonhosted.org/packages/d8/96/9e5625a7c52e7c41fc8c1f89f40d8b4b29577016fa3c645fbfb42433fe83/py_cluster_api-0.6.0-py3-none-any.whl name: py-cluster-api - version: 0.5.0 - sha256: 4f0d1475ff39ec78d79fd8dde85d5ce2fe8146d446a34a713eca820a8de903a8 + version: 0.6.0 + sha256: 7be4d871ae6cc80adbbf33ab8f6616f0301281fede7802264b43948dbf929434 requires_dist: - pyyaml - build ; extra == 'release' diff --git a/pyproject.toml b/pyproject.toml index d8c4c1a2..e6cb24c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ dependencies = [ "packaging >=24.0", "uvicorn >=0.38.0", "x2s3 >=1.2.0", - "py-cluster-api >=0.5.0" + "py-cluster-api >=0.6.0" ] [project.scripts] From f79b4a367523244652bc1867221b36f3a80fdccb Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 19:59:05 -0400 Subject: [PATCH 32/39] cache user groups via getgrouplist instead of getgrall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit getgrall() enumerates every group on the system, which is very slow on LDAP/NIS hosts. getgrouplist() asks NSS what groups the user is in — on this dev box it's ~12x faster (2.9 ms vs 33.7 ms) for the same 27 groups, and the gap grows with the size of the directory. Result is cached per-username so repeat get_profile calls are free for the rest of the worker's lifetime. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/user_worker.py | 39 +++++++++++++++++++++++++++++--------- tests/test_endpoints.py | 3 ++- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 8d768c3b..b427bbf2 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -154,6 +154,35 @@ def decorator(fn): # unmount/remount mid-session. _filestore_cache: dict[str, Any] = {} +# Per-username cache of supplementary group names. Keyed by username so the +# in-process dev/test path (which serves multiple users from one process) +# stays correct; in subprocess mode there's only ever one entry. +_user_groups_cache: dict[str, list[str]] = {} + + +def _get_user_groups(username: str) -> list[str]: + """Return the supplementary group names for a user. + + Uses os.getgrouplist (NSS initgroups) instead of grp.getgrall, which + enumerates every group on the system and is very slow on LDAP/NIS hosts. + Result is cached for the lifetime of the process. + """ + cached = _user_groups_cache.get(username) + if cached is not None: + return cached + + import grp as _grp + pw = pwd.getpwnam(username) + gids = os.getgrouplist(username, pw.pw_gid) + names = [] + for gid in gids: + try: + names.append(_grp.getgrgid(gid).gr_name) + except KeyError: + continue + _user_groups_cache[username] = names + return names + def _get_filestore(fsp_name: str, db_url: str): """Look up a FileSharePath and return a Filestore instance. @@ -555,15 +584,7 @@ def _action_get_profile(request: dict, ctx: WorkerContext) -> dict: user_groups = [] try: - import grp as _grp - user_info = pwd.getpwnam(username) - all_groups = _grp.getgrall() - for group in all_groups: - if username in group.gr_mem: - user_groups.append(group.gr_name) - primary_group = _grp.getgrgid(user_info.pw_gid).gr_name - if primary_group not in user_groups: - user_groups.append(primary_group) + user_groups = _get_user_groups(username) except Exception as e: logger.error(f"Error getting groups for user {username}: {e}") diff --git a/tests/test_endpoints.py b/tests/test_endpoints.py index fbdaeb1a..89f67d73 100644 --- a/tests/test_endpoints.py +++ b/tests/test_endpoints.py @@ -87,8 +87,9 @@ def test_app(temp_dir): # Clear the per-process filestore cache so subsequent tests don't see # stale Filestore instances pointing at this test's temp directory - from fileglancer.user_worker import _filestore_cache + from fileglancer.user_worker import _filestore_cache, _user_groups_cache _filestore_cache.clear() + _user_groups_cache.clear() # Restore original get_settings and clear cache fileglancer.settings.get_settings = original_get_settings From 2a34beafcc6a49be64bd483e923fcc3bd1c3ee1e Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 20:00:59 -0400 Subject: [PATCH 33/39] log stderr forwarder crashes instead of swallowing them If the forwarder dies silently, the worker's stderr pipe eventually fills and blocks the worker on its next write. logger.exception preserves the traceback so the cause is recoverable. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/worker_pool.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index ea0e9ab1..2481e43b 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -331,7 +331,12 @@ async def _spawn_worker(self, username: str) -> UserWorker: return UserWorker(username, process, parent_sock) async def _forward_stderr(self, username: str, process: subprocess.Popen): - """Forward worker stderr lines to loguru in the background.""" + """Forward worker stderr lines to loguru in the background. + + If this task dies, the worker's stderr pipe will eventually fill and + block the worker on its next write — so failures here are logged + loudly rather than swallowed. + """ try: loop = asyncio.get_event_loop() while True: @@ -340,7 +345,7 @@ async def _forward_stderr(self, username: str, process: subprocess.Popen): break logger.debug(f"[worker:{username}] {line.decode().rstrip()}") except Exception: - pass + logger.exception(f"stderr forwarder for worker {username} crashed") async def _evict_lru(self): """Evict the least-recently-used idle worker.""" From 169554daa119def311e32208a8f3ba0817e03048 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 20:11:45 -0400 Subject: [PATCH 34/39] extract find_fsp_in_paths and replace filestore session= with fsps= find_fsp_in_paths is a pure function over the FSP list, so callers don't need a DB session for symlink target resolution. The Filestore methods that previously took a session= for that purpose now take fsps=. Tests that mocked database.find_fsp_from_absolute_path can now pass the real list and let find_fsp_in_paths do the work. Foundation for moving DB access out of the worker subprocess. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/database.py | 40 +++++++--- fileglancer/filestore.py | 46 ++++++----- tests/test_filestore.py | 168 +++++++++++---------------------------- 3 files changed, 98 insertions(+), 156 deletions(-) diff --git a/fileglancer/database.py b/fileglancer/database.py index 98ec46c0..4a52ca3f 100644 --- a/fileglancer/database.py +++ b/fileglancer/database.py @@ -576,27 +576,24 @@ def _find_best_fsp_match( return (best_fsp, subpath) -def find_fsp_from_absolute_path(session: Session, absolute_path: str) -> Optional[tuple[FileSharePath, str]]: - """ - Find the file share path that exactly matches the given absolute path. +def find_fsp_in_paths( + paths: list[FileSharePath], absolute_path: str +) -> Optional[tuple[FileSharePath, str]]: + """Match *absolute_path* against an in-memory list of file share paths. - This function iterates through all file share paths and checks if the absolute - path exists within any of them. Returns the first exact match found. + Pure function with no DB access — useful from contexts that already have + the path list (e.g. a worker subprocess that fetched it once and cached + it). Args: - session: Database session - absolute_path: Absolute file path to match against file shares + paths: All file share paths to search. + absolute_path: Absolute file path to match. Returns: - Tuple of (FileSharePath, relative_subpath) if an exact match is found, None otherwise + ``(fsp, relative_subpath)`` for the longest match, or *None*. """ - # Resolve symlinks in the input path (e.g., /var -> /private/var on macOS) normalized_path = os.path.realpath(absolute_path) - # Get all file share paths - paths = get_file_share_paths(session) - - # Pre-compute expanded mount paths so the helper can use them expanded_mounts: dict[str, str] = {} for fsp in paths: expanded = os.path.expanduser(fsp.mount_path) @@ -612,6 +609,23 @@ def _expanded_mount(fsp: FileSharePath): return result +def find_fsp_from_absolute_path(session: Session, absolute_path: str) -> Optional[tuple[FileSharePath, str]]: + """ + Find the file share path that exactly matches the given absolute path. + + This function iterates through all file share paths and checks if the absolute + path exists within any of them. Returns the first exact match found. + + Args: + session: Database session + absolute_path: Absolute file path to match against file shares + + Returns: + Tuple of (FileSharePath, relative_subpath) if an exact match is found, None otherwise + """ + return find_fsp_in_paths(get_file_share_paths(session), absolute_path) + + def _validate_proxied_path(session: Session, fsp_name: str, path: str) -> None: """Validate a proxied path exists and is accessible""" # Get mount path - check database first using existing session, then check local mounts diff --git a/fileglancer/filestore.py b/fileglancer/filestore.py index f397512c..9590ec8c 100644 --- a/fileglancer/filestore.py +++ b/fileglancer/filestore.py @@ -17,7 +17,7 @@ from typing import Optional, Generator from loguru import logger -from .database import find_fsp_from_absolute_path +from .database import find_fsp_in_paths from .model import FileSharePath # Default buffer size for streaming file contents @@ -84,15 +84,15 @@ def _safe_readlink(path: str, root_path: Optional[str] = None) -> Optional[str]: return None @classmethod - def _get_symlink_target_fsp(cls, absolute_path: str, is_symlink: bool, session, - root_path: Optional[str]) -> Optional[dict]: + def _get_symlink_target_fsp(cls, absolute_path: str, is_symlink: bool, + fsps: Optional[list], root_path: Optional[str]) -> Optional[dict]: """ Resolve a symlink target to a file share path. Returns a dict with fsp_name and subpath if the target is in a known file share, or None if not a symlink, target not found, or target not in any file share. """ - if not is_symlink or session is None: + if not is_symlink or not fsps: return None # Read the symlink target safely @@ -107,7 +107,7 @@ def _get_symlink_target_fsp(cls, absolute_path: str, is_symlink: bool, session, # Try to find which file share contains this target try: - match = find_fsp_from_absolute_path(session, target) + match = find_fsp_in_paths(fsps, target) if match: fsp, subpath = match @@ -133,7 +133,7 @@ def _get_symlink_target_fsp(cls, absolute_path: str, is_symlink: bool, session, @classmethod def from_stat(cls, path: str, absolute_path: str, lstat_result: os.stat_result, stat_result: os.stat_result, - current_user: str = None, session = None, + current_user: str = None, fsps: Optional[list] = None, root_path: Optional[str] = None, user_groups: Optional[set[str]] = None): """ @@ -145,7 +145,7 @@ def from_stat(cls, path: str, absolute_path: str, lstat_result: Result of os.lstat() on the path (detects symlinks). stat_result: Result of os.stat() or lstat for broken symlinks. current_user: Username for permission checking (optional). - session: Database session for symlink resolution (optional). + fsps: List of FileSharePath objects for symlink target resolution (optional). root_path: Filestore root for defense-in-depth validation in symlink reading (optional). user_groups: Pre-computed user group set to avoid per-file getgrall() (optional). """ @@ -177,7 +177,7 @@ def from_stat(cls, path: str, absolute_path: str, hasRead, hasWrite = cls._check_permissions(stat_result, current_user, owner, group, user_groups) # Resolve symlink target to file share path if applicable - symlink_target_fsp = cls._get_symlink_target_fsp(absolute_path, is_symlink, session, root_path) + symlink_target_fsp = cls._get_symlink_target_fsp(absolute_path, is_symlink, fsps, root_path) return cls( name=name, @@ -280,7 +280,8 @@ def _check_path_in_root(self, path: Optional[str]) -> str: return full_path - def _get_file_info_from_path(self, full_path: str, current_user: str = None, session = None, + def _get_file_info_from_path(self, full_path: str, current_user: str = None, + fsps: Optional[list] = None, user_groups: Optional[set[str]] = None) -> FileInfo: """ Get the FileInfo for a file or directory at the given path. @@ -348,13 +349,14 @@ def _is_within_root(p: str) -> bool: return FileInfo.from_stat( rel_path, full_path, lstat_result, stat_result, - current_user=current_user, session=session, + current_user=current_user, fsps=fsps, root_path=self.root_path, user_groups=user_groups, ) - def _file_info_from_direntry(self, entry: os.DirEntry, current_user: str = None, session = None, + def _file_info_from_direntry(self, entry: os.DirEntry, current_user: str = None, + fsps: Optional[list] = None, user_groups: Optional[set[str]] = None) -> FileInfo: """Build a FileInfo from a DirEntry, using entry.stat() instead of os.lstat/os.stat. @@ -387,7 +389,7 @@ def _file_info_from_direntry(self, entry: os.DirEntry, current_user: str = None, return FileInfo.from_stat( rel_path, full_path, lstat_result, stat_result, - current_user=current_user, session=session, + current_user=current_user, fsps=fsps, root_path=self.root_path, user_groups=user_groups, ) @@ -430,7 +432,8 @@ def get_absolute_path(self, relative_path: Optional[str] = None) -> str: return os.path.abspath(os.path.join(self.root_path, relative_path)) - def get_file_info(self, path: Optional[str] = None, current_user: str = None, session = None) -> FileInfo: + def get_file_info(self, path: Optional[str] = None, current_user: str = None, + fsps: Optional[list] = None) -> FileInfo: """ Get the FileInfo for a file or directory at the given path. @@ -439,7 +442,7 @@ def get_file_info(self, path: Optional[str] = None, current_user: str = None, se May be None, in which case the root directory is used. current_user (str): The username of the current user for permission checking. May be None, in which case hasRead and hasWrite will be None. - session: Database session for symlink resolution. + fsps: List of FileSharePath objects for symlink target resolution. May be None, in which case symlink_target_fsp will be None. Raises: @@ -449,7 +452,7 @@ def get_file_info(self, path: Optional[str] = None, current_user: str = None, se full_path = self.root_path else: full_path = os.path.join(self.root_path, path) - return self._get_file_info_from_path(full_path, current_user, session) + return self._get_file_info_from_path(full_path, current_user, fsps) def check_is_binary(self, path: Optional[str] = None, sample_size: int = 4096) -> bool: @@ -489,7 +492,7 @@ def check_is_binary(self, path: Optional[str] = None, sample_size: int = 4096) - def yield_file_infos_paginated(self, path: Optional[str] = None, current_user: str = None, - session = None, limit: int = 200, + fsps: Optional[list] = None, limit: int = 200, cursor: Optional[str] = None) -> tuple[list[FileInfo], bool, Optional[str], int]: """ Return a page of FileInfo objects for children of the given path. @@ -501,7 +504,7 @@ def yield_file_infos_paginated(self, path: Optional[str] = None, current_user: s Args: path: Relative path to the directory to list. current_user: Username for permission checking. - session: Database session for symlink resolution. + fsps: List of FileSharePath objects for symlink target resolution. limit: Maximum number of entries to return. cursor: Name of the last entry from the previous page. Entries after this name (in sort order) are returned. @@ -541,7 +544,7 @@ def yield_file_infos_paginated(self, path: Optional[str] = None, current_user: s for entry in page_entries: try: file_infos.append( - self._file_info_from_direntry(entry, current_user, session, user_groups) + self._file_info_from_direntry(entry, current_user, fsps, user_groups) ) except PermissionError as e: logger.error(f"Permission denied accessing entry: {entry.path}: {e}") @@ -550,7 +553,8 @@ def yield_file_infos_paginated(self, path: Optional[str] = None, current_user: s next_cursor = page_entries[-1].name if has_more and page_entries else None return file_infos, has_more, next_cursor, total_count - def yield_file_infos(self, path: Optional[str] = None, current_user: str = None, session = None) -> Generator[FileInfo, None, None]: + def yield_file_infos(self, path: Optional[str] = None, current_user: str = None, + fsps: Optional[list] = None) -> Generator[FileInfo, None, None]: """ Yield a FileInfo object for each child of the given path. @@ -559,7 +563,7 @@ def yield_file_infos(self, path: Optional[str] = None, current_user: str = None, May be None, in which case the root directory is listed. current_user (str): The username of the current user for permission checking. May be None, in which case hasRead and hasWrite will be None. - session: Database session for symlink resolution. + fsps: List of FileSharePath objects for symlink target resolution. May be None, in which case symlink_target_fsp will be None for symlinks. Raises: @@ -577,7 +581,7 @@ def yield_file_infos(self, path: Optional[str] = None, current_user: str = None, entries.sort(key=lambda e: (not e.is_dir(follow_symlinks=False), e.name)) for entry in entries: try: - yield self._file_info_from_direntry(entry, current_user, session, user_groups) + yield self._file_info_from_direntry(entry, current_user, fsps, user_groups) except PermissionError as e: # Skip files we don't have permission to access logger.error(f"Permission denied accessing entry: {entry.path}: {e}") diff --git a/tests/test_filestore.py b/tests/test_filestore.py index 53765afc..18997f61 100644 --- a/tests/test_filestore.py +++ b/tests/test_filestore.py @@ -5,12 +5,9 @@ import tempfile import shutil -from contextlib import contextmanager -from unittest.mock import Mock, MagicMock, patch -from typing import List, Callable, Optional +from unittest.mock import MagicMock, patch from conftest import requires_symlinks -from fileglancer import database from fileglancer.filestore import Filestore, FileInfo from fileglancer.model import FileSharePath @@ -46,36 +43,6 @@ def filestore(test_dir): return Filestore(file_share_path) -@contextmanager -def mock_database_for_symlinks(file_share_paths: List[FileSharePath], - find_fsp_func: Optional[Callable] = None): - """ - Context manager to mock database functions for symlink resolution tests. - - Args: - file_share_paths: List of FileSharePath objects to return from get_file_share_paths - find_fsp_func: Optional custom function for find_fsp_from_absolute_path. - If None, the original function is preserved (not mocked). - """ - original_get_paths = database.get_file_share_paths - original_find = database.find_fsp_from_absolute_path - - def mock_get_file_share_paths(session, fsp_name=None): - paths = file_share_paths - if fsp_name: - paths = [p for p in paths if p.name == fsp_name] - return paths - - try: - database.get_file_share_paths = mock_get_file_share_paths - if find_fsp_func is not None: - database.find_fsp_from_absolute_path = find_fsp_func - yield - finally: - database.get_file_share_paths = original_get_paths - database.find_fsp_from_absolute_path = original_find - - def test_unmounted_filestore(): test_dir = "/not/a/real/path" file_share_path = FileSharePath(zone="test", name="test", mount_path=test_dir) @@ -270,25 +237,17 @@ def test_same_share_symlink_resolution_via_listing(filestore, test_dir): symlink_path = os.path.join(test_dir, "link_to_subdir_file") os.symlink(target_file, symlink_path) - mock_session = Mock() fsp = FileSharePath(zone="test", name="test", mount_path=test_dir) - def mock_find(session, path): - # Normalize paths for comparison - if os.path.realpath(path) == os.path.realpath(target_file): - return (fsp, "subdir/target_same_share.txt") - return None - - with mock_database_for_symlinks([fsp], mock_find): - # Use yield_file_infos to list directory - symlinks are detected this way - files = list(filestore.yield_file_infos("", session=mock_session)) - symlink_info = next((f for f in files if f.name == "link_to_subdir_file"), None) + # Use yield_file_infos to list directory - symlinks are detected this way + files = list(filestore.yield_file_infos("", fsps=[fsp])) + symlink_info = next((f for f in files if f.name == "link_to_subdir_file"), None) - assert symlink_info is not None - assert symlink_info.is_symlink is True - assert symlink_info.symlink_target_fsp is not None - assert symlink_info.symlink_target_fsp["fsp_name"] == "test" - assert symlink_info.symlink_target_fsp["subpath"] == "subdir/target_same_share.txt" + assert symlink_info is not None + assert symlink_info.is_symlink is True + assert symlink_info.symlink_target_fsp is not None + assert symlink_info.symlink_target_fsp["fsp_name"] == "test" + assert symlink_info.symlink_target_fsp["subpath"] == "subdir/target_same_share.txt" @requires_symlinks @@ -314,24 +273,15 @@ def test_cross_share_symlink_resolution_via_listing(test_dir): fsp2 = FileSharePath(zone="test", name="share2", mount_path=share2_dir) filestore1 = Filestore(fsp1) - mock_session = Mock() - - def mock_find(session, path): - # Normalize paths for comparison - if os.path.realpath(path) == os.path.realpath(target_file): - return (fsp2, "target.txt") - return None - - with mock_database_for_symlinks([fsp1, fsp2], mock_find): - # Use yield_file_infos to list directory - symlinks are detected this way - files = list(filestore1.yield_file_infos("", session=mock_session)) - symlink_info = next((f for f in files if f.name == "link_to_share2"), None) + # Use yield_file_infos to list directory - symlinks are detected this way + files = list(filestore1.yield_file_infos("", fsps=[fsp1, fsp2])) + symlink_info = next((f for f in files if f.name == "link_to_share2"), None) - assert symlink_info is not None - assert symlink_info.is_symlink is True - assert symlink_info.symlink_target_fsp is not None - assert symlink_info.symlink_target_fsp["fsp_name"] == "share2" - assert symlink_info.symlink_target_fsp["subpath"] == "target.txt" + assert symlink_info is not None + assert symlink_info.is_symlink is True + assert symlink_info.symlink_target_fsp is not None + assert symlink_info.symlink_target_fsp["fsp_name"] == "share2" + assert symlink_info.symlink_target_fsp["subpath"] == "target.txt" @requires_symlinks @@ -353,23 +303,14 @@ def test_relative_symlink_resolution(test_dir): fsp_rel = FileSharePath(zone="test", name="rel_test", mount_path=os.path.join(test_dir, "rel_test")) nested_filestore = Filestore(fsp) - mock_session = Mock() + # List directory to find the symlink + files = list(nested_filestore.yield_file_infos("", fsps=[fsp, fsp_rel])) + symlink_info = next((f for f in files if f.name == "link"), None) - def mock_find(session, path): - if os.path.realpath(path) == os.path.realpath(target_file): - # Return fsp for rel_test directory - return (fsp_rel, "target.txt") - return None - - with mock_database_for_symlinks([fsp, fsp_rel], mock_find): - # List directory to find the symlink - files = list(nested_filestore.yield_file_infos("", session=mock_session)) - symlink_info = next((f for f in files if f.name == "link"), None) - - assert symlink_info is not None - assert symlink_info.is_symlink is True - assert symlink_info.symlink_target_fsp is not None - assert symlink_info.symlink_target_fsp["subpath"] == "target.txt" + assert symlink_info is not None + assert symlink_info.is_symlink is True + assert symlink_info.symlink_target_fsp is not None + assert symlink_info.symlink_target_fsp["subpath"] == "target.txt" @requires_symlinks @@ -384,16 +325,14 @@ def test_yield_file_infos_with_symlinks(filestore, test_dir): os.path.join(test_dir, "link1") ) - mock_session = Mock() fsp = FileSharePath(zone="test", name="test", mount_path=test_dir) - with mock_database_for_symlinks([fsp], lambda s, p: (fsp, "file1.txt")): - files = list(filestore.yield_file_infos("", session=mock_session)) + files = list(filestore.yield_file_infos("", fsps=[fsp])) - # Find the symlink in the list - symlink_info = next((f for f in files if f.name == "link1"), None) - assert symlink_info is not None - assert symlink_info.is_symlink is True + # Find the symlink in the list + symlink_info = next((f for f in files if f.name == "link1"), None) + assert symlink_info is not None + assert symlink_info.is_symlink is True @requires_symlinks @@ -432,18 +371,16 @@ def test_symlink_to_directory(filestore, test_dir): symlink_path = os.path.join(test_dir, "link_to_dir") os.symlink(target_dir, symlink_path) - mock_session = Mock() fsp = FileSharePath(zone="test", name="test", mount_path=test_dir) - with mock_database_for_symlinks([fsp], lambda s, p: (fsp, "target_dir")): - # Use yield_file_infos to list directory - symlinks to dirs are detected this way - files = list(filestore.yield_file_infos("", session=mock_session)) - symlink_info = next((f for f in files if f.name == "link_to_dir"), None) + # Use yield_file_infos to list directory - symlinks to dirs are detected this way + files = list(filestore.yield_file_infos("", fsps=[fsp])) + symlink_info = next((f for f in files if f.name == "link_to_dir"), None) - assert symlink_info is not None - assert symlink_info.is_symlink is True - assert symlink_info.is_dir is True # Should also be marked as directory - assert symlink_info.symlink_target_fsp is not None + assert symlink_info is not None + assert symlink_info.is_symlink is True + assert symlink_info.is_dir is True # Should also be marked as directory + assert symlink_info.symlink_target_fsp is not None @requires_symlinks @@ -476,29 +413,16 @@ def test_broken_symlink_within_share(test_dir): missing_target = os.path.join(test_dir, "subdir", "nonexistent.txt") os.symlink(missing_target, broken_link_path) - mock_session = Mock() - - def mock_find(session, path): - # This would normally match the file share pattern, - # but since the target doesn't exist, we should not get here - # because os.path.exists check should return False first - normalized_path = os.path.realpath(path) - test_dir_real = os.path.realpath(test_dir) - if normalized_path.startswith(test_dir_real): - return (fsp, os.path.relpath(normalized_path, test_dir_real)) - return None - - with mock_database_for_symlinks([fsp], mock_find): - # Get file infos with session (so symlink resolution is attempted) - file_infos = list(filestore.yield_file_infos("", session=mock_session)) - - # Find the broken symlink - broken_link_info = next((f for f in file_infos if f.name == "link_to_missing_file"), None) - - # Verify the symlink is detected but target is not resolved - assert broken_link_info is not None, "Broken symlink should be listed" - assert broken_link_info.is_symlink is True, "Should be marked as symlink" - assert broken_link_info.symlink_target_fsp is None, "symlink_target_fsp should be None for broken symlink even if target path matches share pattern" + # Get file infos with fsps (so symlink resolution is attempted) + file_infos = list(filestore.yield_file_infos("", fsps=[fsp])) + + # Find the broken symlink + broken_link_info = next((f for f in file_infos if f.name == "link_to_missing_file"), None) + + # Verify the symlink is detected but target is not resolved + assert broken_link_info is not None, "Broken symlink should be listed" + assert broken_link_info.is_symlink is True, "Should be marked as symlink" + assert broken_link_info.symlink_target_fsp is None, "symlink_target_fsp should be None for broken symlink even if target path matches share pattern" # Filestore.validate_path() tests From 7fc2e34118d50418053fd010f4dee86e6b6ae387 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 20:22:29 -0400 Subject: [PATCH 35/39] proxy worker DB queries through the parent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workers no longer receive FGC_DB_URL. Any DB access from a worker handler now goes through ctx.db (a DbProxy). In a subprocess, that's RpcDbProxy, which sends a db_request message back over the same socket; the parent's UserWorker._handle_db_request runs the query with full credentials and sends a db_response. In dev/test mode the same handlers use LocalDbProxy to call the database directly. The wire protocol picks up two new message kinds (db_request, db_response) distinguished by a "_kind" field; existing action requests/responses keep their old shape. The whitelist of allowed db methods lives in user_worker.DB_METHODS — currently get_file_share_paths and get_job, both read-only and narrowly scoped. Filestore session= and validate_path_in_filestore(session) were already refactored to fsps= in earlier commits; this commit threads ctx.db through the worker handlers and removes the last db.get_db_session(ctx.db_url) calls. read_job_file is split out of get_job_file_content so the worker can read job files without re-doing the DB lookup. Closes the threat where a user could read /proc//environ to recover DB credentials and access other users' session tokens etc. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/apps/core.py | 75 ++++---- fileglancer/server.py | 6 +- fileglancer/user_worker.py | 357 +++++++++++++++++++++++-------------- fileglancer/worker_pool.py | 76 ++++++-- tests/test_apps.py | 15 +- tests/test_worker.py | 15 +- 6 files changed, 345 insertions(+), 199 deletions(-) diff --git a/fileglancer/apps/core.py b/fileglancer/apps/core.py index ba520d85..4f71e956 100644 --- a/fileglancer/apps/core.py +++ b/fileglancer/apps/core.py @@ -501,12 +501,12 @@ def validate_path_for_shell(path_value: str) -> str | None: return None -def validate_path_in_filestore(path_value: str, session) -> str | None: +def validate_path_in_filestore(path_value: str, fsps: list) -> str | None: """Validate a path exists and is readable within an allowed file share. - Performs syntax checks, then resolves the path against known file share - mounts via the database. Returns an error message string if invalid, - or None if valid. + Performs syntax checks, then resolves the path against the given list of + file share paths. Returns an error message string if invalid, or None if + valid. """ # Syntax check first error = validate_path_for_shell(path_value) @@ -523,8 +523,8 @@ def validate_path_in_filestore(path_value: str, session) -> str | None: expanded = os.path.expanduser(normalized) # Resolve to a file share path - from fileglancer.database import find_fsp_from_absolute_path - result = find_fsp_from_absolute_path(session, expanded) + from fileglancer.database import find_fsp_in_paths + result = find_fsp_in_paths(fsps, expanded) if result is None: return "Path is not within an allowed file share" @@ -599,7 +599,8 @@ def _validate_parameter_value(param: AppParameter, value, session=None) -> str: home = home.replace("\\", "/") str_val = home + str_val[1:] if session is not None: - error = validate_path_in_filestore(str_val, session) + fsps = db.get_file_share_paths(session) + error = validate_path_in_filestore(str_val, fsps) else: error = validate_path_for_shell(str_val) if error: @@ -1474,19 +1475,27 @@ def _resolve_work_dir(db_job: db.JobDB) -> Path: return _build_work_dir(db_job.id, db_job.app_name, db_job.entry_point_id) -def _resolve_browse_path(abs_path: str) -> tuple[str | None, str | None]: - """Resolve an absolute path to an FSP name and subpath for browse links.""" - settings = get_settings() - with db.get_db_session(settings.db_url) as session: - result = db.find_fsp_from_absolute_path(session, abs_path) +def _resolve_browse_path(abs_path: str, fsps: Optional[list] = None) -> tuple[str | None, str | None]: + """Resolve an absolute path to an FSP name and subpath for browse links. + + If *fsps* is None, queries the database directly. Pass a pre-fetched list + to avoid the DB hit (used from the worker subprocess, which has no DB + access). + """ + if fsps is None: + settings = get_settings() + with db.get_db_session(settings.db_url) as session: + result = db.find_fsp_from_absolute_path(session, abs_path) + else: + result = db.find_fsp_in_paths(fsps, abs_path) if result: return result[0].name, result[1] return None, None -def _make_file_info(file_path: str, exists: bool) -> dict: +def _make_file_info(file_path: str, exists: bool, fsps: Optional[list] = None) -> dict: """Create a file info dict with browse link resolution.""" - fsp_name, subpath = _resolve_browse_path(file_path) if exists else (None, None) + fsp_name, subpath = _resolve_browse_path(file_path, fsps) if exists else (None, None) return { "path": file_path, "exists": exists, @@ -1524,10 +1533,11 @@ def get_service_url(db_job: db.JobDB) -> Optional[str]: return url -def get_job_file_paths(db_job: db.JobDB) -> dict[str, dict]: +def get_job_file_paths(db_job: db.JobDB, fsps: Optional[list] = None) -> dict[str, dict]: """Return file path info for a job's files (script, stdout, stderr, service_url). - Returns a dict keyed by file type with path and existence info. + Returns a dict keyed by file type with path and existence info. Pass *fsps* + to skip the per-file DB lookup needed for browse-link resolution. """ work_dir = _resolve_work_dir(db_job) @@ -1539,21 +1549,21 @@ def get_job_file_paths(db_job: db.JobDB) -> dict[str, dict]: stderr_path = work_dir / "stderr.log" files = { - "script": _make_file_info(script_path, len(scripts) > 0), - "stdout": _make_file_info(str(stdout_path), stdout_path.is_file()), - "stderr": _make_file_info(str(stderr_path), stderr_path.is_file()), + "script": _make_file_info(script_path, len(scripts) > 0, fsps), + "stdout": _make_file_info(str(stdout_path), stdout_path.is_file(), fsps), + "stderr": _make_file_info(str(stderr_path), stderr_path.is_file(), fsps), } # Include service_url file info for service-type jobs if getattr(db_job, 'entry_point_type', 'job') == 'service': service_url_path = work_dir / "service_url" - files["service_url"] = _make_file_info(str(service_url_path), service_url_path.is_file()) + files["service_url"] = _make_file_info(str(service_url_path), service_url_path.is_file(), fsps) return files -def get_job_file_content(job_id: int, username: str, file_type: str) -> Optional[str]: - """Read the content of a job file (script, stdout, or stderr). +def read_job_file(db_job, file_type: str) -> Optional[str]: + """Read the content of a job file given a loaded job record. All job files live in the job's work directory: - *.sh — the generated script (written by cluster-api) @@ -1562,14 +1572,6 @@ def get_job_file_content(job_id: int, username: str, file_type: str) -> Optional Returns the file content as a string, or None if the file doesn't exist. """ - settings = get_settings() - - with db.get_db_session(settings.db_url) as session: - db_job = db.get_job(session, job_id, username) - if db_job is None: - raise ValueError(f"Job {job_id} not found") - session.expunge(db_job) - work_dir = _resolve_work_dir(db_job) if file_type == "script": @@ -1588,3 +1590,16 @@ def get_job_file_content(job_id: int, username: str, file_type: str) -> Optional if path.is_file(): return path.read_text() return None + + +def get_job_file_content(job_id: int, username: str, file_type: str) -> Optional[str]: + """Read job file by id+username (does its own DB lookup).""" + settings = get_settings() + + with db.get_db_session(settings.db_url) as session: + db_job = db.get_job(session, job_id, username) + if db_job is None: + raise ValueError(f"Job {job_id} not found") + session.expunge(db_job) + + return read_job_file(db_job, file_type) diff --git a/fileglancer/server.py b/fileglancer/server.py index 297393e4..c6d19e97 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -248,16 +248,16 @@ async def _worker_exec(username: str, action: str, **kwargs): raise HTTPException(status_code=e.status_code, detail=str(e)) else: # Dev/test mode: run action directly in-process - from fileglancer.user_worker import _ACTIONS, WorkerContext + from fileglancer.user_worker import _ACTIONS, WorkerContext, LocalDbProxy handler = _ACTIONS.get(action) if handler is None: raise HTTPException(status_code=500, detail=f"Unknown action: {action}") - ctx = WorkerContext(username=username, db_url=settings.db_url) + ctx = WorkerContext(username=username, db=LocalDbProxy(settings.db_url)) request = {"action": action, **kwargs} try: result = handler(request, ctx) except Exception as e: - logger.error(f"Action handler error for {username} action={action}: {e}") + logger.exception(f"Action handler error for {username} action={action}: {e}") raise HTTPException(status_code=500, detail=str(e)) # Strip the raw fd (not meaningful in-process), keep _file_handle result.pop("_fd", None) diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index b427bbf2..7f632e29 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -144,6 +144,113 @@ def decorator(fn): return decorator +# --------------------------------------------------------------------------- +# DB proxy +# --------------------------------------------------------------------------- +# +# Worker subprocesses run as the (untrusted) target user, so we don't give +# them the database URL. Instead, action handlers go through a DbProxy that +# reverse-RPCs back to the parent over the same socket. The parent runs the +# query with full credentials and returns the result. +# +# In dev/test mode (no subprocess), the same handlers use LocalDbProxy which +# calls the database functions directly. +# +# DB_METHODS is the whitelist of methods both proxies expose; the parent's +# inbound dispatch only accepts these names. + +from types import SimpleNamespace + + +def _job_db_to_dict(j) -> dict: + """Serialize a JobDB row to a JSON-safe dict for transport to the worker. + + Only includes fields used by worker-side handlers (read_job_file, + get_job_file_paths, get_service_url) — keep this list minimal so the + worker sees as little of the DB row as possible. + """ + return { + "id": j.id, + "app_name": j.app_name, + "entry_point_id": j.entry_point_id, + "entry_point_type": getattr(j, "entry_point_type", "job"), + "status": j.status, + "work_dir": j.work_dir, + } + + +class LocalDbProxy: + """DbProxy backed by a real database connection. + + Used in dev/test mode (in-process) and on the parent side as the + backend for inbound db_request messages from worker subprocesses. + """ + + def __init__(self, db_url: str): + self.db_url = db_url + + def get_file_share_paths(self): + from fileglancer import database as db + with db.get_db_session(self.db_url) as session: + return db.get_file_share_paths(session) + + def get_job(self, job_id: int, username: str): + from fileglancer import database as db + with db.get_db_session(self.db_url) as session: + j = db.get_job(session, job_id, username) + if j is None: + return None + return SimpleNamespace(**_job_db_to_dict(j)) + + +class RpcDbProxy: + """DbProxy that reverse-RPCs each call back to the parent over the socket.""" + + def __init__(self, sock: socket.socket): + self.sock = sock + + def _call(self, method: str, **kwargs): + _send(self.sock, {"_kind": "db_request", "method": method, "kwargs": kwargs}) + resp = _recv(self.sock) + if resp.get("_kind") != "db_response": + raise RuntimeError(f"Expected db_response, got: {resp!r}") + if not resp.get("ok"): + raise RuntimeError(resp.get("error", "DB request failed")) + return resp.get("result") + + def get_file_share_paths(self): + from fileglancer.model import FileSharePath + rows = self._call("get_file_share_paths") or [] + return [FileSharePath(**r) for r in rows] + + def get_job(self, job_id: int, username: str): + result = self._call("get_job", job_id=job_id, username=username) + return SimpleNamespace(**result) if result else None + + +# Whitelist of method names the parent will dispatch; used by worker_pool +# when handling inbound db_request messages. +DB_METHODS = frozenset({"get_file_share_paths", "get_job"}) + + +def serialize_db_result(method: str, value): + """Convert a LocalDbProxy result into a JSON-serializable form. + + Called by the parent before sending a db_response back to the worker. + Keeps the wire format consistent regardless of which backend produced + the value. + """ + if value is None: + return None + if method == "get_file_share_paths": + # value is a list of FileSharePath models + return [fsp.model_dump(mode="json") for fsp in value] + if method == "get_job": + # value is a SimpleNamespace; vars() gives the underlying dict + return vars(value) + raise ValueError(f"Unknown db method: {method}") + + # --------------------------------------------------------------------------- # Action handlers — file operations # --------------------------------------------------------------------------- @@ -184,7 +291,7 @@ def _get_user_groups(username: str) -> list[str]: return names -def _get_filestore(fsp_name: str, db_url: str): +def _get_filestore(fsp_name: str, fsps: list): """Look up a FileSharePath and return a Filestore instance. Returns (filestore, None) on success, or (None, error_response) on failure @@ -194,16 +301,14 @@ def _get_filestore(fsp_name: str, db_url: str): if cached is not None: return cached, None - from fileglancer import database as db from fileglancer.filestore import Filestore - with db.get_db_session(db_url) as session: - fsp = db.get_file_share_path(session, fsp_name) - if fsp is None: - return None, { - "error": f"File share path '{fsp_name}' not found", - "status_code": 404, - } + fsp = next((f for f in fsps if f.name == fsp_name), None) + if fsp is None: + return None, { + "error": f"File share path '{fsp_name}' not found", + "status_code": 404, + } filestore = Filestore(fsp) try: @@ -221,56 +326,62 @@ def _get_filestore(fsp_name: str, db_url: str): def with_filestore(fn): """Resolve request["fsp_name"] to a Filestore and pass it as the third arg. + Also passes the freshly-fetched FSP list as the fourth arg, so handlers + that need it (for symlink resolution etc.) don't have to fetch again. + Returns an error response if the filestore can't be resolved (404 for a missing fsp, 503 for an unmounted one) so the handler body never has to deal with the not-found case. """ @functools.wraps(fn) def wrapper(request: dict, ctx: WorkerContext) -> dict: - filestore, error_response = _get_filestore(request["fsp_name"], ctx.db_url) + fsps = ctx.db.get_file_share_paths() + filestore, error_response = _get_filestore(request["fsp_name"], fsps) if filestore is None: return error_response - return fn(request, ctx, filestore) + return fn(request, ctx, filestore, fsps) return wrapper +def _redirect_or_error(e, fsps): + """Build a redirect response for a RootCheckError, or fall through to 400.""" + from fileglancer.database import find_fsp_in_paths + match = find_fsp_in_paths(fsps, e.full_path) + if match: + fsp, relative_subpath = match + return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} + return {"error": str(e), "status_code": 400} + + @action("list_dir") @with_filestore -def _action_list_dir(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_list_dir(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """List directory contents.""" subpath = request.get("subpath", "") current_user = ctx.username - from fileglancer import database as db from fileglancer.filestore import RootCheckError try: - with db.get_db_session(ctx.db_url) as session: - file_info = filestore.get_file_info(subpath, current_user=current_user, session=session) - result = {"info": file_info.model_dump(mode="json")} - - if file_info.is_dir: - try: - files = list(filestore.yield_file_infos(subpath, current_user=current_user, session=session)) - result["files"] = [f.model_dump(mode="json") for f in files] - except PermissionError: - result["files"] = [] - result["error"] = "Permission denied when listing directory contents" - result["status_code"] = 403 - except FileNotFoundError: - result["files"] = [] - result["error"] = "Directory contents not found" - result["status_code"] = 404 + file_info = filestore.get_file_info(subpath, current_user=current_user, fsps=fsps) + result = {"info": file_info.model_dump(mode="json")} + + if file_info.is_dir: + try: + files = list(filestore.yield_file_infos(subpath, current_user=current_user, fsps=fsps)) + result["files"] = [f.model_dump(mode="json") for f in files] + except PermissionError: + result["files"] = [] + result["error"] = "Permission denied when listing directory contents" + result["status_code"] = 403 + except FileNotFoundError: + result["files"] = [] + result["error"] = "Directory contents not found" + result["status_code"] = 404 return result except RootCheckError as e: - # Path escapes root — check if it belongs to another file share - with db.get_db_session(ctx.db_url) as session: - match = db.find_fsp_from_absolute_path(session, e.full_path) - if match: - fsp, relative_subpath = match - return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} - return {"error": str(e), "status_code": 400} + return _redirect_or_error(e, fsps) except FileNotFoundError: return {"error": "File or directory not found", "status_code": 404} except PermissionError: @@ -279,48 +390,41 @@ def _action_list_dir(request: dict, ctx: WorkerContext, filestore) -> dict: @action("list_dir_paged") @with_filestore -def _action_list_dir_paged(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_list_dir_paged(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """List directory contents with pagination.""" subpath = request.get("subpath", "") current_user = ctx.username limit = request.get("limit", 200) cursor = request.get("cursor") - from fileglancer import database as db from fileglancer.filestore import RootCheckError try: - with db.get_db_session(ctx.db_url) as session: - file_info = filestore.get_file_info(subpath, current_user=current_user, session=session) - result = {"info": file_info.model_dump(mode="json")} - - if file_info.is_dir: - try: - files, has_more, next_cursor, total_count = filestore.yield_file_infos_paginated( - subpath, current_user=current_user, session=session, - limit=limit, cursor=cursor - ) - result["files"] = [f.model_dump(mode="json") for f in files] - result["has_more"] = has_more - result["next_cursor"] = next_cursor - result["total_count"] = total_count - except PermissionError: - result["files"] = [] - result["error"] = "Permission denied when listing directory contents" - result["status_code"] = 403 - except FileNotFoundError: - result["files"] = [] - result["error"] = "Directory contents not found" - result["status_code"] = 404 + file_info = filestore.get_file_info(subpath, current_user=current_user, fsps=fsps) + result = {"info": file_info.model_dump(mode="json")} + + if file_info.is_dir: + try: + files, has_more, next_cursor, total_count = filestore.yield_file_infos_paginated( + subpath, current_user=current_user, fsps=fsps, + limit=limit, cursor=cursor + ) + result["files"] = [f.model_dump(mode="json") for f in files] + result["has_more"] = has_more + result["next_cursor"] = next_cursor + result["total_count"] = total_count + except PermissionError: + result["files"] = [] + result["error"] = "Permission denied when listing directory contents" + result["status_code"] = 403 + except FileNotFoundError: + result["files"] = [] + result["error"] = "Directory contents not found" + result["status_code"] = 404 return result except RootCheckError as e: - with db.get_db_session(ctx.db_url) as session: - match = db.find_fsp_from_absolute_path(session, e.full_path) - if match: - fsp, relative_subpath = match - return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} - return {"error": str(e), "status_code": 400} + return _redirect_or_error(e, fsps) except FileNotFoundError: return {"error": "File or directory not found", "status_code": 404} except PermissionError: @@ -329,16 +433,13 @@ def _action_list_dir_paged(request: dict, ctx: WorkerContext, filestore) -> dict @action("get_file_info") @with_filestore -def _action_get_file_info(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_get_file_info(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Get metadata for a single file or directory.""" subpath = request.get("subpath", "") - from fileglancer import database as db - try: - with db.get_db_session(ctx.db_url) as session: - file_info = filestore.get_file_info(subpath, current_user=ctx.username, session=session) - return {"info": file_info.model_dump(mode="json")} + file_info = filestore.get_file_info(subpath, current_user=ctx.username, fsps=fsps) + return {"info": file_info.model_dump(mode="json")} except FileNotFoundError: return {"error": "File or directory not found", "status_code": 404} except PermissionError: @@ -347,7 +448,7 @@ def _action_get_file_info(request: dict, ctx: WorkerContext, filestore) -> dict: @action("check_binary") @with_filestore -def _action_check_binary(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_check_binary(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Check if a file is binary.""" subpath = request.get("subpath", "") @@ -362,7 +463,7 @@ def _action_check_binary(request: dict, ctx: WorkerContext, filestore) -> dict: @action("open_file") @with_filestore -def _action_open_file(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_open_file(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Open a file and return its metadata + open file descriptor. The worker opens the file as the user and passes the fd back to the @@ -395,13 +496,7 @@ def _action_open_file(request: dict, ctx: WorkerContext, filestore) -> dict: "_file_handle": file_handle, # kept alive until fd is sent } except RootCheckError as e: - from fileglancer import database as db - with db.get_db_session(ctx.db_url) as session: - match = db.find_fsp_from_absolute_path(session, e.full_path) - if match: - fsp, relative_subpath = match - return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} - return {"error": str(e), "status_code": 400} + return _redirect_or_error(e, fsps) except FileNotFoundError: return {"error": f"File or directory not found: {fsp_name}/{subpath}", "status_code": 404} except PermissionError: @@ -410,7 +505,7 @@ def _action_open_file(request: dict, ctx: WorkerContext, filestore) -> dict: @action("head_file") @with_filestore -def _action_head_file(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_head_file(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Get file metadata and binary check for HEAD requests.""" subpath = request.get("subpath", "") @@ -429,13 +524,7 @@ def _action_head_file(request: dict, ctx: WorkerContext, filestore) -> dict: "is_binary": is_binary, } except RootCheckError as e: - from fileglancer import database as db - with db.get_db_session(ctx.db_url) as session: - match = db.find_fsp_from_absolute_path(session, e.full_path) - if match: - fsp, relative_subpath = match - return {"redirect": True, "fsp_name": fsp.name, "subpath": relative_subpath or ""} - return {"error": str(e), "status_code": 400} + return _redirect_or_error(e, fsps) except FileNotFoundError: return {"error": "File or directory not found", "status_code": 404} except PermissionError: @@ -444,7 +533,7 @@ def _action_head_file(request: dict, ctx: WorkerContext, filestore) -> dict: @action("create_dir") @with_filestore -def _action_create_dir(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_create_dir(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Create a directory.""" subpath = request["subpath"] @@ -459,7 +548,7 @@ def _action_create_dir(request: dict, ctx: WorkerContext, filestore) -> dict: @action("create_file") @with_filestore -def _action_create_file(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_create_file(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Create an empty file.""" subpath = request["subpath"] @@ -474,7 +563,7 @@ def _action_create_file(request: dict, ctx: WorkerContext, filestore) -> dict: @action("rename") @with_filestore -def _action_rename(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_rename(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Rename a file or directory.""" old_path = request["old_path"] new_path = request["new_path"] @@ -490,7 +579,7 @@ def _action_rename(request: dict, ctx: WorkerContext, filestore) -> dict: @action("delete") @with_filestore -def _action_delete(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_delete(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Delete a file or directory.""" subpath = request["subpath"] @@ -505,7 +594,7 @@ def _action_delete(request: dict, ctx: WorkerContext, filestore) -> dict: @action("chmod") @with_filestore -def _action_chmod(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_chmod(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Change file permissions.""" subpath = request["subpath"] permissions = request["permissions"] @@ -521,7 +610,7 @@ def _action_chmod(request: dict, ctx: WorkerContext, filestore) -> dict: @action("update_file") @with_filestore -def _action_update_file(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_update_file(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Handle rename and/or permission change on a file.""" subpath = request.get("subpath", "") new_path = request.get("new_path") @@ -549,38 +638,33 @@ def _action_update_file(request: dict, ctx: WorkerContext, filestore) -> dict: def _action_validate_paths(request: dict, ctx: WorkerContext) -> dict: """Validate file/directory paths for app parameters.""" from fileglancer.apps.core import validate_path_in_filestore - from fileglancer import database as db paths = request["paths"] + fsps = ctx.db.get_file_share_paths() errors = {} - with db.get_db_session(ctx.db_url) as session: - for param_key, path_value in paths.items(): - error = validate_path_in_filestore(path_value, session) - if error: - errors[param_key] = error + for param_key, path_value in paths.items(): + error = validate_path_in_filestore(path_value, fsps) + if error: + errors[param_key] = error return {"errors": errors} @action("get_profile") def _action_get_profile(request: dict, ctx: WorkerContext) -> dict: """Get user profile information.""" - from fileglancer import database as db - username = ctx.username + paths = ctx.db.get_file_share_paths() - with db.get_db_session(ctx.db_url) as session: - paths = db.get_file_share_paths(session) - - home_fsp = next((fsp for fsp in paths if fsp.mount_path in ('~', '~/')), None) - if home_fsp: - home_directory_name = "." - else: - home_directory_path = os.path.expanduser(f"~{username}") - home_parent = os.path.dirname(home_directory_path) - home_fsp = next((fsp for fsp in paths if fsp.mount_path == home_parent), None) - home_directory_name = os.path.basename(home_directory_path) + home_fsp = next((fsp for fsp in paths if fsp.mount_path in ('~', '~/')), None) + if home_fsp: + home_directory_name = "." + else: + home_directory_path = os.path.expanduser(f"~{username}") + home_parent = os.path.dirname(home_directory_path) + home_fsp = next((fsp for fsp in paths if fsp.mount_path == home_parent), None) + home_directory_name = os.path.basename(home_directory_path) - home_fsp_name = home_fsp.name if home_fsp else None + home_fsp_name = home_fsp.name if home_fsp else None user_groups = [] try: @@ -637,10 +721,15 @@ def _action_generate_ssh_key(request: dict, ctx: WorkerContext) -> dict: @action("get_job_file") def _action_get_job_file(request: dict, ctx: WorkerContext) -> dict: """Read job file content (script, stdout, stderr).""" - from fileglancer.apps.core import get_job_file_content + from fileglancer.apps.core import read_job_file job_id = request["job_id"] file_type = request["file_type"] - content = get_job_file_content(job_id, ctx.username, file_type) + + db_job = ctx.db.get_job(job_id, ctx.username) + if db_job is None: + return {"error": f"Job {job_id} not found", "status_code": 404} + + content = read_job_file(db_job, file_type) if content is None: return {"content": None} return {"content": content} @@ -650,15 +739,13 @@ def _action_get_job_file(request: dict, ctx: WorkerContext) -> dict: def _action_get_job_file_paths(request: dict, ctx: WorkerContext) -> dict: """Get job file path info.""" from fileglancer.apps.core import get_job_file_paths - from fileglancer import database as db job_id = request["job_id"] - - with db.get_db_session(ctx.db_url) as session: - db_job = db.get_job(session, job_id, ctx.username) - if db_job is None: - return {"error": f"Job {job_id} not found", "status_code": 404} - files = get_job_file_paths(db_job) + db_job = ctx.db.get_job(job_id, ctx.username) + if db_job is None: + return {"error": f"Job {job_id} not found", "status_code": 404} + fsps = ctx.db.get_file_share_paths() + files = get_job_file_paths(db_job, fsps) return {"files": files} @@ -666,15 +753,12 @@ def _action_get_job_file_paths(request: dict, ctx: WorkerContext) -> dict: def _action_get_service_url(request: dict, ctx: WorkerContext) -> dict: """Read service URL from job work directory.""" from fileglancer.apps.core import get_service_url - from fileglancer import database as db job_id = request["job_id"] - - with db.get_db_session(ctx.db_url) as session: - db_job = db.get_job(session, job_id, ctx.username) - if db_job is None: - return {"error": f"Job {job_id} not found", "status_code": 404} - url = get_service_url(db_job) + db_job = ctx.db.get_job(job_id, ctx.username) + if db_job is None: + return {"error": f"Job {job_id} not found", "status_code": 404} + url = get_service_url(db_job) return {"service_url": url} @@ -788,7 +872,7 @@ def _action_s3_open_object(request: dict, ctx: WorkerContext) -> dict: @action("validate_proxied_path") @with_filestore -def _action_validate_proxied_path(request: dict, ctx: WorkerContext, filestore) -> dict: +def _action_validate_proxied_path(request: dict, ctx: WorkerContext, filestore, fsps) -> dict: """Validate that the user can access a proxied path. Runs within the user's context (the worker IS the user), so @@ -966,9 +1050,9 @@ def _action_read_manifest(request: dict, ctx: WorkerContext) -> dict: class WorkerContext: """Holds per-worker state.""" - def __init__(self, username: str, db_url: str): + def __init__(self, username: str, db): self.username = username - self.db_url = db_url + self.db = db def main(): @@ -1005,8 +1089,9 @@ def main(): except KeyError: username = str(uid) - db_url = os.environ.get("FGC_DB_URL", "") - ctx = WorkerContext(username=username, db_url=db_url) + # Worker subprocess never gets DB credentials; all DB access goes back + # through the parent over the same socket via RpcDbProxy. + ctx = WorkerContext(username=username, db=RpcDbProxy(sock)) logger.info( f"Worker started for {username} " diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 2481e43b..2956500a 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -71,10 +71,11 @@ class UserWorker: """ def __init__(self, username: str, process: subprocess.Popen, - sock: socket.socket): + sock: socket.socket, db_proxy): self.username = username self.process = process self.sock = sock + self.db_proxy = db_proxy # LocalDbProxy used to satisfy worker db_requests self.last_activity = time.monotonic() self._busy = False self._lock = asyncio.Lock() # serialize requests to the worker @@ -130,22 +131,38 @@ async def execute(self, action: str, **kwargs) -> Any: self.last_activity = time.monotonic() def _send_and_recv(self, request: dict) -> dict: - """Send a request and receive the response (blocking, runs in thread). + """Send a request and receive the action response (blocking, runs in thread). - All receives use recvmsg() so that SCM_RIGHTS file descriptors are - captured transparently — the ancillary data arrives with the first - bytes of the message, so we must use recvmsg for the header too. + Loops on receive: any inbound ``_kind == "db_request"`` message is a + reverse-RPC from the worker (which has no DB credentials) asking the + parent to run a DB query on its behalf. We dispatch it, send back a + ``db_response``, and keep reading. Anything else is the action result. """ - # Send - payload = json.dumps(request).encode() + self._send_msg(request) + + while True: + response = self._recv_msg() + if response.get("_kind") == "db_request": + self._handle_db_request(response) + continue + return response + + def _send_msg(self, msg: dict): + """Send a length-prefixed JSON message.""" + payload = json.dumps(msg).encode() header = struct.pack(_HEADER_FMT, len(payload)) self.sock.sendall(header + payload) - # Receive header + payload + optional fd, all via recvmsg + def _recv_msg(self) -> dict: + """Receive one length-prefixed JSON message, capturing any SCM_RIGHTS fd. + + All receives use recvmsg() so that SCM_RIGHTS file descriptors are + captured transparently — the ancillary data arrives with the first + bytes of the message, so we must use recvmsg for the header too. + """ fds = array.array("i") raw = b"" try: - # First, read at least the header while len(raw) < _HEADER_SIZE: msg, ancdata, flags, addr = self.sock.recvmsg( max(_HEADER_SIZE - len(raw), 4096), @@ -162,7 +179,6 @@ def _send_and_recv(self, request: dict) -> dict: if length > _MAX_MESSAGE_SIZE: raise WorkerError(f"Response too large: {length} bytes") - # We may have read some payload bytes with the header total_needed = _HEADER_SIZE + length while len(raw) < total_needed: msg, ancdata, flags, addr = self.sock.recvmsg( @@ -187,7 +203,6 @@ def _send_and_recv(self, request: dict) -> dict: pass raise - # If an fd arrived, wrap it in a file object and close any extras if fds: response["_file_handle"] = os.fdopen(fds[0], "rb") for extra_fd in fds[1:]: @@ -198,6 +213,32 @@ def _send_and_recv(self, request: dict) -> dict: return response + def _handle_db_request(self, request: dict): + """Run a DB query on behalf of the worker and send the result back.""" + from fileglancer.user_worker import DB_METHODS, serialize_db_result + + method = request.get("method") + kwargs = request.get("kwargs", {}) or {} + if method not in DB_METHODS: + self._send_msg({ + "_kind": "db_response", + "ok": False, + "error": f"Unknown db method: {method}", + }) + return + + try: + value = getattr(self.db_proxy, method)(**kwargs) + result = serialize_db_result(method, value) + self._send_msg({"_kind": "db_response", "ok": True, "result": result}) + except Exception as e: + logger.exception(f"db_request {method} for {self.username} failed") + self._send_msg({ + "_kind": "db_response", + "ok": False, + "error": f"{type(e).__name__}: {e}", + }) + async def shutdown(self, timeout: float = 5.0): """Ask the worker to shut down gracefully, then force-kill if needed.""" if not self.is_alive: @@ -233,12 +274,17 @@ class WorkerPool: """ def __init__(self, settings: Settings): + from fileglancer.user_worker import LocalDbProxy + self.settings = settings self._workers: dict[str, UserWorker] = {} self._locks: dict[str, asyncio.Lock] = {} self._eviction_task: Optional[asyncio.Task] = None self.max_workers = settings.worker_pool_max_workers self.idle_timeout = settings.worker_pool_idle_timeout + # All worker DB requests are satisfied by this proxy in the parent; + # the worker subprocess never sees the DB URL. + self._db_proxy = LocalDbProxy(settings.db_url) def _get_lock(self, username: str) -> asyncio.Lock: if username not in self._locks: @@ -296,13 +342,17 @@ async def _spawn_worker(self, username: str) -> UserWorker: # Create Unix socketpair for IPC parent_sock, child_sock = socket.socketpair() + # Worker subprocess deliberately does NOT receive the DB URL — it + # runs as the (untrusted) target user, so credentials would be + # readable via /proc//environ. All DB queries reverse-RPC back + # to the parent over the IPC socket instead. env = { **os.environ, "HOME": pw.pw_dir, "FGC_LOG_LEVEL": self.settings.log_level, - "FGC_DB_URL": self.settings.db_url, "FGC_WORKER_FD": str(child_sock.fileno()), } + env.pop("FGC_DB_URL", None) logger.info( f"Spawning persistent worker for {username} " @@ -328,7 +378,7 @@ async def _spawn_worker(self, username: str) -> UserWorker: # Start a background task to forward worker stderr to loguru asyncio.create_task(self._forward_stderr(username, process)) - return UserWorker(username, process, parent_sock) + return UserWorker(username, process, parent_sock, self._db_proxy) async def _forward_stderr(self, username: str, process: subprocess.Popen): """Forward worker stderr lines to loguru in the background. diff --git a/tests/test_apps.py b/tests/test_apps.py index 5783cb97..f6031316 100644 --- a/tests/test_apps.py +++ b/tests/test_apps.py @@ -499,9 +499,7 @@ class TestValidatePathInFilestore: def test_path_outside_any_share(self): """Path not in any file share returns an error.""" - mock_session = MagicMock() - with patch("fileglancer.database.find_fsp_from_absolute_path", return_value=None): - error = validate_path_in_filestore("/nowhere/file.txt", mock_session) + error = validate_path_in_filestore("/nowhere/file.txt", []) assert error is not None assert "not within an allowed file share" in error @@ -513,17 +511,12 @@ def test_valid_path_in_share(self, tmp_path): from fileglancer.model import FileSharePath fsp = FileSharePath(zone="test", name="test", mount_path=str(tmp_path)) - - mock_session = MagicMock() - with patch("fileglancer.database.find_fsp_from_absolute_path", - return_value=(fsp, "data.txt")): - error = validate_path_in_filestore(str(test_file), mock_session) + error = validate_path_in_filestore(str(test_file), [fsp]) assert error is None def test_syntax_error_short_circuits(self): - """Metachar in path returns error before DB lookup.""" - mock_session = MagicMock() - error = validate_path_in_filestore("/data;bad", mock_session) + """Metachar in path returns error before path lookup.""" + error = validate_path_in_filestore("/data;bad", []) assert error is not None assert "invalid characters" in error diff --git a/tests/test_worker.py b/tests/test_worker.py index 3f451af0..2cc294dd 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -182,7 +182,7 @@ def poll(self): return None def wait(self): pass def kill(self): pass - worker = UserWorker("testuser", FakeProcess(), parent) + worker = UserWorker("testuser", FakeProcess(), parent, db_proxy=None) return worker, child def test_send_and_recv_basic(self): @@ -277,7 +277,7 @@ def poll(self): return None def wait(self): pass def kill(self): pass - worker = UserWorker("testuser", FakeProcess(), parent) + worker = UserWorker("testuser", FakeProcess(), parent, db_proxy=None) return worker, child @pytest.mark.asyncio @@ -330,7 +330,7 @@ def poll(self): return 1 def wait(self): pass def kill(self): pass - worker = UserWorker("testuser", DeadProcess(), parent) + worker = UserWorker("testuser", DeadProcess(), parent, db_proxy=None) with pytest.raises(WorkerDead): await worker.execute("anything") parent.close() @@ -432,10 +432,12 @@ def temp_dir(self): @pytest.fixture def ctx(self, temp_dir): - """Create a WorkerContext with a test database.""" + """Create a WorkerContext with a LocalDbProxy backed by the test database.""" from fileglancer.settings import get_settings + from fileglancer.user_worker import LocalDbProxy settings = get_settings() - return WorkerContext(username=os.environ.get("USER", "test"), db_url=settings.db_url) + return WorkerContext(username=os.environ.get("USER", "test"), + db=LocalDbProxy(settings.db_url)) def test_get_profile(self, ctx): handler = _ACTIONS["get_profile"] @@ -475,8 +477,9 @@ def target(): username = str(uid) from fileglancer.settings import get_settings + from fileglancer.user_worker import LocalDbProxy settings = get_settings() - ctx = WorkerContext(username=username, db_url=settings.db_url) + ctx = WorkerContext(username=username, db=LocalDbProxy(settings.db_url)) while True: try: From e74e58c28bec9a552b935854acbac8078c7aa946 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Fri, 8 May 2026 21:41:28 -0400 Subject: [PATCH 36/39] log worker actions --- fileglancer/worker_pool.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fileglancer/worker_pool.py b/fileglancer/worker_pool.py index 2956500a..659a525c 100644 --- a/fileglancer/worker_pool.py +++ b/fileglancer/worker_pool.py @@ -104,6 +104,8 @@ async def execute(self, action: str, **kwargs) -> Any: if not self.is_alive: raise WorkerDead(f"Worker for {self.username} is dead (rc={self.process.returncode})") + logger.debug(f"Delegating {action} to worker for {self.username} (pid={self.process.pid})") + async with self._lock: self._busy = True self.last_activity = time.monotonic() From de9246ddc83438ba7863c7fea73afe23a4965932 Mon Sep 17 00:00:00 2001 From: Konrad Rokicki Date: Sat, 9 May 2026 21:50:57 -0400 Subject: [PATCH 37/39] route apps and cluster ops through persistent worker pool Dispatches ensure_repo, discover_manifests, read_manifest, submit, cancel, poll, and reconnect through _worker_exec instead of spawning a fresh fileglancer.apps.worker subprocess per call. Deletes apps/worker.py and the parent's now-dead cluster_api stdlib-logging interceptor. Co-Authored-By: Claude Opus 4.7 (1M context) --- fileglancer/apps/core.py | 189 ++++++++------------------ fileglancer/apps/worker.py | 268 ------------------------------------- fileglancer/server.py | 27 +--- fileglancer/user_worker.py | 9 ++ tests/test_poll.py | 35 ++--- tests/test_worker.py | 2 + 6 files changed, 91 insertions(+), 439 deletions(-) delete mode 100644 fileglancer/apps/worker.py diff --git a/fileglancer/apps/core.py b/fileglancer/apps/core.py index 4f71e956..89e679cb 100644 --- a/fileglancer/apps/core.py +++ b/fileglancer/apps/core.py @@ -6,18 +6,14 @@ except ImportError: fcntl = None # type: ignore[assignment] try: - import grp import pwd except ImportError: - grp = None # type: ignore[assignment] pwd = None # type: ignore[assignment] -import json import os import re import shlex import shutil import subprocess -import sys import tempfile from pathlib import Path from datetime import datetime, UTC @@ -36,6 +32,26 @@ from fileglancer.settings import get_settings +# Registered by server.py at startup. Dispatches an action to the per-user +# persistent worker (or in-process in dev mode). Signature mirrors +# server._worker_exec: (username, action, **kwargs) -> awaitable[dict]. +_worker_exec = None + + +def set_worker_exec(fn): + """Register the persistent worker dispatcher. Called from server lifespan.""" + global _worker_exec + _worker_exec = fn + + +async def _dispatch(username: str, action: str, **kwargs) -> dict: + if _worker_exec is None: + raise RuntimeError( + "Worker dispatcher not registered — apps module used before server startup" + ) + return await _worker_exec(username, action, **kwargs) + + _MANIFEST_FILENAME = "runnables.yaml" def _repo_cache_base(username: str | None = None) -> Path: @@ -157,17 +173,9 @@ async def _ensure_repo_cache(url: str, pull: bool = False, branch = await _resolve_default_branch(clone_url) if username: - logger.debug( - f"Delegating ensure_repo to worker for user={username} " - f"repo={owner}/{repo} ({branch}) pull={pull}" - ) lock = _get_repo_lock(owner, repo, branch) async with lock: - result = await _run_as_user_async(username, { - "action": "ensure_repo", - "url": url, - "pull": pull, - }) + result = await _dispatch(username, "ensure_repo", url=url, pull=pull) return Path(result["repo_dir"]) # Running as the current user (worker subprocess or dev mode) @@ -281,11 +289,7 @@ async def discover_app_manifests(url: str, running as the target user. """ if username: - logger.debug(f"Delegating discover_manifests to worker for user={username} url={url}") - result = await _run_as_user_async(username, { - "action": "discover_manifests", - "url": url, - }) + result = await _dispatch(username, "discover_manifests", url=url) return [ (item["path"], AppManifest(**item["manifest"])) for item in result["manifests"] @@ -305,12 +309,7 @@ async def fetch_app_manifest(url: str, manifest_path: str = "", running as the target user. """ if username: - logger.debug(f"Delegating read_manifest to worker for user={username} url={url}") - result = await _run_as_user_async(username, { - "action": "read_manifest", - "url": url, - "manifest_path": manifest_path, - }) + result = await _dispatch(username, "read_manifest", url=url, manifest_path=manifest_path) return AppManifest(**result["manifest"]) repo_dir = await _ensure_repo_cache(url) @@ -683,85 +682,14 @@ def build_command(entry_point: AppEntryPoint, parameters: dict, session=None) -> return (" \\\n ").join(parts) -def _run_as_user(username: str, request: dict) -> dict: - """Run a worker action as the given user in a subprocess. - - Spawns a child process with the target user's identity using - Python 3.9+ ``user``/``group``/``extra_groups`` subprocess kwargs. - The child runs fileglancer.apps.worker, which creates a fresh - py-cluster-api executor and performs the requested action. - - Returns the parsed JSON response from the worker. - Raises ValueError on worker failure. - """ - pw = pwd.getpwnam(username) - action = request.get("action", "unknown") - - # Only switch identity if running as root; otherwise we're already - # the target user (e.g. development mode). - identity_kwargs: dict = {} - if os.geteuid() == 0: - groups = [g.gr_gid for g in grp.getgrall() if username in g.gr_mem] - if pw.pw_gid not in groups: - groups.append(pw.pw_gid) - identity_kwargs = { - "user": pw.pw_uid, - "group": pw.pw_gid, - "extra_groups": groups, - } - logger.debug( - f"Spawning worker action={action} as user={username} " - f"uid={pw.pw_uid} gid={pw.pw_gid} HOME={pw.pw_dir}" - ) - else: - logger.debug( - f"Spawning worker action={action} as current user " - f"(euid={os.geteuid()})" - ) - - result = subprocess.run( - [sys.executable, "-m", "fileglancer.apps.worker"], - input=json.dumps(request).encode(), - capture_output=True, - env={**os.environ, "HOME": pw.pw_dir, "FGC_LOG_LEVEL": get_settings().log_level}, - **identity_kwargs, - ) - - # Forward worker stderr (contains cluster_api and worker logs) - if result.stderr: - for line in result.stderr.decode().rstrip().splitlines(): - logger.debug(f"[worker:{action}] {line}") - - if result.stdout: - try: - response = json.loads(result.stdout) - except json.JSONDecodeError: - raise ValueError( - f"Worker produced invalid JSON: {result.stdout.decode()[:500]}" - ) - else: - response = {} - - if result.returncode != 0: - error = response.get("error", result.stderr.decode()[:500]) - raise ValueError(f"Worker failed: {error}") - - return response - - -async def _run_as_user_async(username: str, request: dict) -> dict: - """Async wrapper for _run_as_user that doesn't block the event loop.""" - return await asyncio.to_thread(_run_as_user, username, request) - - # --- Job Monitoring --- # # The server process runs as root, which cannot execute LSF commands # (bjobs, bsub, bkill) due to HPC root-squash policy. All LSF -# operations go through worker subprocesses running as a real user. +# operations go through the persistent per-user worker pool. # -# The poll loop picks any user with active jobs and spawns a worker -# that runs ``bjobs -u all`` to get statuses for ALL users' jobs. +# The poll loop picks any user with active jobs and dispatches ``bjobs +# -u all`` through that user's worker to get statuses for ALL users' jobs. _poll_task = None _POLL_LOCK_PATH = os.path.join(tempfile.gettempdir(), "fileglancer_poll.lock") @@ -780,7 +708,7 @@ async def start_job_monitor(): try: with open(_POLL_LOCK_PATH, "w") as f: fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) - _reconnect_as_any_user(settings) + await _reconnect_as_any_user(settings) fcntl.flock(f, fcntl.LOCK_UN) logger.info("Job monitor started (reconnected existing jobs)") except OSError: @@ -836,8 +764,8 @@ def _get_any_active_username(settings) -> str | None: return None -def _reconnect_as_any_user(settings): - """Reconnect to existing cluster jobs via a worker subprocess. +async def _reconnect_as_any_user(settings): + """Reconnect to existing cluster jobs via the persistent worker. Picks any user with active jobs to run bjobs as. If no active jobs exist, reconnection is skipped (nothing to reconnect to). @@ -849,11 +777,8 @@ def _reconnect_as_any_user(settings): cluster_config = settings.cluster.model_dump(exclude_none=True) try: - result = _run_as_user(username, { - "action": "reconnect", - "cluster_config": cluster_config, - }) - except ValueError as e: + result = await _dispatch(username, "reconnect", cluster_config=cluster_config) + except Exception as e: logger.debug(f"Job reconnection skipped: {e}") return @@ -880,7 +805,7 @@ def _reconnect_as_any_user(settings): async def _poll_loop(settings): - """Periodically poll cluster job statuses via a worker subprocess. + """Periodically poll cluster job statuses via the persistent worker. All uvicorn workers run this loop, but only the one that acquires the file lock actually polls. The lock is held through both the @@ -899,7 +824,7 @@ async def _poll_loop(settings): lock_fd = open(_POLL_LOCK_PATH, "w") fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) try: - has_jobs = _poll_jobs(settings) + has_jobs = await _poll_jobs(settings) except Exception: logger.exception("Error in job poll loop") has_jobs = True # keep polling on error @@ -920,7 +845,7 @@ async def _poll_loop(settings): await asyncio.sleep(settings.cluster.poll_interval) -def _poll_jobs(settings): +async def _poll_jobs(settings): """Run one poll cycle: query bjobs via worker, update DB. Returns True if there are active jobs to continue polling, @@ -967,13 +892,13 @@ def _poll_jobs(settings): cluster_config = settings.cluster.model_dump(exclude_none=True) try: - result = _run_as_user(poll_username, { - "action": "poll", - "cluster_config": cluster_config, - "cluster_job_ids": list(job_statuses.keys()), - "job_statuses": job_statuses, - }) - except ValueError as e: + result = await _dispatch( + poll_username, "poll", + cluster_config=cluster_config, + cluster_job_ids=list(job_statuses.keys()), + job_statuses=job_statuses, + ) + except Exception as e: logger.warning(f"Poll failed: {e}") return True # keep polling on error @@ -1345,18 +1270,18 @@ async def submit_job( resource_spec.stdout_path = str(work_dir / "stdout.log") resource_spec.stderr_path = str(work_dir / "stderr.log") - # Submit to the cluster as the target user. The worker subprocess - # creates the work directory, symlinks the repo, and calls + # Submit to the cluster as the target user via the persistent worker: + # it creates the work directory, symlinks the repo, and calls # executor.submit() — all with the user's identity. job_name = f"{manifest.name}-{entry_point.id}" cluster_config = settings.cluster.model_dump(exclude_none=True) try: - worker_result = _run_as_user(username, { - "action": "submit", - "cluster_config": cluster_config, - "command": full_command, - "job_name": job_name, - "resources": { + worker_result = await _dispatch( + username, "submit", + cluster_config=cluster_config, + command=full_command, + job_name=job_name, + resources={ "cpus": resource_spec.cpus, "gpus": resource_spec.gpus, "memory": resource_spec.memory, @@ -1368,9 +1293,9 @@ async def submit_job( "extra_directives": resource_spec.extra_directives, "extra_args": resource_spec.extra_args, }, - "work_dir": str(work_dir), - "cached_repo_dir": str(cached_repo_dir), - }) + work_dir=str(work_dir), + cached_repo_dir=str(cached_repo_dir), + ) except Exception: # Cluster submission failed — remove the PENDING DB record so # the job does not appear in the user's jobs list. @@ -1450,11 +1375,11 @@ async def cancel_job(job_id: int, username: str) -> db.JobDB: # Cancel on cluster as the target user if db_job.cluster_job_id: cluster_config = settings.cluster.model_dump(exclude_none=True) - _run_as_user(username, { - "action": "cancel", - "cluster_config": cluster_config, - "job_id": db_job.cluster_job_id, - }) + await _dispatch( + username, "cancel", + cluster_config=cluster_config, + job_id=db_job.cluster_job_id, + ) # Update DB now = datetime.now(UTC) diff --git a/fileglancer/apps/worker.py b/fileglancer/apps/worker.py deleted file mode 100644 index f5170bdb..00000000 --- a/fileglancer/apps/worker.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Subprocess worker for running operations as a target user. - -This module is invoked as a subprocess by fileglancer to run py-cluster-api -operations (submit, cancel, poll) and git/manifest operations (clone, pull, -read) with the identity of the authenticated user. The parent process uses -Python 3.9+ ``user``/``group``/``extra_groups`` subprocess kwargs to set the -child's identity before any code runs. - -Protocol: - - Input: JSON on stdin - - Output: JSON on stdout ({"job_id": ...} or {"error": ...}) - - Errors: non-zero exit code + JSON error on stdout - -Usage (called by fileglancer, not directly): - subprocess.run( - [sys.executable, "-m", "fileglancer.apps.worker"], - input=json.dumps(request).encode(), - capture_output=True, - env={**os.environ, "HOME": pw.pw_dir}, - user=uid, group=gid, extra_groups=groups, - ) -""" - -from __future__ import annotations - -import asyncio -import json -import os -import sys -from pathlib import Path - -from cluster_api import create_executor, ResourceSpec - - -async def _submit(request: dict) -> dict: - """Create work dir, symlink repo, submit job via py-cluster-api.""" - config = request["cluster_config"] - # extra_args are handled via ResourceSpec, not config - config.pop("extra_args", None) - - executor = create_executor(**config) - - work_dir = Path(request["work_dir"]) - work_dir.mkdir(parents=True, exist_ok=True) - - # Symlink the cached repo into the work directory - cached_repo_dir = request["cached_repo_dir"] - repo_link = work_dir / "repo" - if repo_link.is_symlink() or repo_link.exists(): - repo_link.unlink() - repo_link.symlink_to(cached_repo_dir) - - # Build ResourceSpec from the serialized dict - res = request["resources"] - resource_spec = ResourceSpec( - cpus=res.get("cpus"), - gpus=res.get("gpus"), - memory=res.get("memory"), - walltime=res.get("walltime"), - queue=res.get("queue"), - work_dir=res["work_dir"], - stdout_path=res.get("stdout_path"), - stderr_path=res.get("stderr_path"), - extra_directives=res.get("extra_directives"), - extra_args=res.get("extra_args"), - ) - - job = await executor.submit( - command=request["command"], - name=request["job_name"], - resources=resource_spec, - ) - - # For local executor, write the subprocess PID to disk so the poll - # loop can check process liveness across worker invocations. - # Only LocalExecutor has a _processes dict; HPC executors don't. - processes = getattr(executor, "_processes", None) - if processes is not None: - proc = processes.get(job.job_id) - if proc is not None: - pid_file = work_dir / "job.pid" - pid_file.write_text(str(proc.pid)) - - return {"job_id": job.job_id, "script_path": job.script_path} - - -async def _cancel(request: dict) -> dict: - """Cancel a cluster job via py-cluster-api.""" - config = request["cluster_config"] - config.pop("extra_args", None) - - executor = create_executor(**config) - await executor.cancel(request["job_id"]) - - return {"status": "ok"} - - -async def _poll(request: dict) -> dict: - """Poll job statuses via py-cluster-api (bjobs -u all). - - The executor needs to know which jobs to track, so we seed it with - the cluster_job_ids from the DB before polling. After poll(), we - return the updated statuses and metadata for each tracked job. - """ - from cluster_api._types import JobRecord, JobStatus - - config = request["cluster_config"] - config.pop("extra_args", None) - - executor = create_executor(**config) - - # Seed the executor with stub JobRecords so poll() knows what to track. - # poll() queries bjobs and updates these records in-place. - # Use each job's current DB status so that jobs not found in bjobs - # output keep their real status instead of reverting to PENDING. - known_statuses = request.get("job_statuses", {}) - for cid in request["cluster_job_ids"]: - db_status = known_statuses.get(cid, "PENDING").lower() - try: - seed_status = JobStatus(db_status) - except ValueError: - seed_status = JobStatus.PENDING - executor._jobs[cid] = JobRecord( - job_id=cid, - name="", - command="", - status=seed_status, - ) - - await executor.poll() - - # Return the updated state for each job - jobs = {} - for cid, record in executor.jobs.items(): - jobs[cid] = { - "status": record.status.value, - "exit_code": record.exit_code, - "exec_host": record.exec_host, - "start_time": record.start_time.isoformat() if record.start_time else None, - "finish_time": record.finish_time.isoformat() if record.finish_time else None, - } - - return {"jobs": jobs} - - -async def _reconnect(request: dict) -> dict: - """Reconnect to existing jobs via py-cluster-api (bjobs -u all).""" - config = request["cluster_config"] - config.pop("extra_args", None) - - executor = create_executor(**config) - reconnected = await executor.reconnect() - - jobs = {} - for record in reconnected: - jobs[record.job_id] = { - "status": record.status.value, - "name": record.name, - "exit_code": record.exit_code, - "exec_host": record.exec_host, - "start_time": record.start_time.isoformat() if record.start_time else None, - "finish_time": record.finish_time.isoformat() if record.finish_time else None, - } - - return {"jobs": jobs} - - -async def _ensure_repo(request: dict) -> dict: - """Clone or update a GitHub repo in the current user's cache.""" - from fileglancer.apps.core import _ensure_repo_cache - - repo_dir = await _ensure_repo_cache( - url=request["url"], - pull=request.get("pull", False), - ) - return {"repo_dir": str(repo_dir)} - - -async def _discover_manifests(request: dict) -> dict: - """Clone/pull repo and discover all manifests.""" - from fileglancer.apps.core import _ensure_repo_cache, _find_manifests_in_repo - - repo_dir = await _ensure_repo_cache( - url=request["url"], - pull=True, - ) - results = _find_manifests_in_repo(repo_dir) - return { - "manifests": [ - {"path": path, "manifest": manifest.model_dump(mode="json")} - for path, manifest in results - ] - } - - -async def _read_manifest(request: dict) -> dict: - """Fetch and read a single manifest from a cached repo.""" - from fileglancer.apps.core import _ensure_repo_cache, _read_manifest_file - - repo_dir = await _ensure_repo_cache( - url=request["url"], - pull=request.get("pull", False), - ) - manifest_path = request.get("manifest_path", "") - target_dir = repo_dir / manifest_path if manifest_path else repo_dir - manifest = _read_manifest_file(target_dir) - return {"manifest": manifest.model_dump(mode="json")} - - -_ACTIONS = { - "submit": _submit, - "cancel": _cancel, - "poll": _poll, - "reconnect": _reconnect, - "ensure_repo": _ensure_repo, - "discover_manifests": _discover_manifests, - "read_manifest": _read_manifest, -} - - -def main(): - import logging - import pwd as _pwd - - # Configure cluster_api logging so debug output reaches the parent - # process via stderr. The parent captures stderr separately. - log_level = os.environ.get("FGC_LOG_LEVEL", "INFO").upper() - # Map loguru-specific levels to their nearest stdlib equivalents - _LOGURU_TO_STDLIB = {"TRACE": "DEBUG", "SUCCESS": "INFO"} - log_level = _LOGURU_TO_STDLIB.get(log_level, log_level) - handler = logging.StreamHandler(sys.stderr) - handler.setFormatter(logging.Formatter( - "%(levelname)s | %(name)s:%(funcName)s:%(lineno)d - %(message)s" - )) - cluster_logger = logging.getLogger("cluster_api") - cluster_logger.addHandler(handler) - cluster_logger.setLevel(log_level) - - request = json.loads(sys.stdin.buffer.read()) - action = request.get("action") - - uid = os.getuid() - euid = os.geteuid() - try: - uname = _pwd.getpwuid(uid).pw_name - except KeyError: - uname = str(uid) - print( - f"[worker] action={action} uid={uid}({uname}) euid={euid} " - f"HOME={os.environ.get('HOME', '')}", - file=sys.stderr, - ) - - handler = _ACTIONS.get(action) - if handler is None: - json.dump({"error": f"Unknown action: {action}"}, sys.stdout) - sys.exit(1) - - try: - result = asyncio.run(handler(request)) - json.dump(result, sys.stdout) - except Exception as e: - json.dump({"error": str(e)}, sys.stdout) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/fileglancer/server.py b/fileglancer/server.py index c6d19e97..fecbbc11 100644 --- a/fileglancer/server.py +++ b/fileglancer/server.py @@ -1,4 +1,3 @@ -import logging import os import re import sys @@ -307,27 +306,6 @@ async def lifespan(app: FastAPI): logger.remove() logger.add(sys.stderr, level=settings.log_level) - # Intercept stdlib logging (e.g. py-cluster-api) into loguru - class InterceptHandler(logging.Handler): - def emit(self, record): - # Get corresponding loguru level - try: - level = logger.level(record.levelname).name - except ValueError: - level = record.levelno - # Find caller from where the log call originated - frame, depth = logging.currentframe(), 0 - while frame and frame.f_code.co_filename == logging.__file__: - frame = frame.f_back - depth += 1 - logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage()) - - # Attach directly to cluster_api logger so uvicorn can't clobber it - cluster_logger = logging.getLogger("cluster_api") - cluster_logger.handlers = [InterceptHandler()] - cluster_logger.setLevel(logging.DEBUG) - cluster_logger.propagate = False - def mask_password(url: str) -> str: """Mask password in database URL for logging""" import re @@ -397,6 +375,11 @@ def mask_password(url: str) -> str: await worker_pool.start_eviction_loop() logger.info("Worker pool started") + # Wire the apps module to dispatch through the persistent worker + # pool (or in-process in dev mode) instead of spawning ephemeral + # subprocesses. + apps_module.set_worker_exec(_worker_exec) + # Start cluster job monitor try: await apps_module.start_job_monitor() diff --git a/fileglancer/user_worker.py b/fileglancer/user_worker.py index 7f632e29..b0c8889f 100644 --- a/fileglancer/user_worker.py +++ b/fileglancer/user_worker.py @@ -935,6 +935,15 @@ def _action_submit(request: dict, ctx: WorkerContext) -> dict: resources=resource_spec, )) + # For LocalExecutor, persist the subprocess PID so the parent's poll + # loop can check liveness across calls. HPC executors don't have + # _processes and don't need this. + processes = getattr(executor, "_processes", None) + if processes is not None: + proc = processes.get(job.job_id) + if proc is not None: + (work_dir / "job.pid").write_text(str(proc.pid)) + return {"job_id": job.job_id, "script_path": job.script_path} diff --git a/tests/test_poll.py b/tests/test_poll.py index 01801e6b..151ceaf2 100644 --- a/tests/test_poll.py +++ b/tests/test_poll.py @@ -9,7 +9,7 @@ from datetime import datetime, UTC from pathlib import Path from types import SimpleNamespace -from unittest.mock import patch, MagicMock, call +from unittest.mock import patch, MagicMock, AsyncMock, call from fileglancer.apps.core import _poll_jobs, _poll_local_jobs, _POLL_LOCK_PATH @@ -56,9 +56,9 @@ class TestPollSkipsSameStatus: _poll_jobs must NOT call update_job_status. This was the bug that caused 'RUNNING -> RUNNING' log spam with multiple workers.""" - @patch("fileglancer.apps.core._run_as_user") + @patch("fileglancer.apps.core._dispatch", new_callable=AsyncMock) @patch("fileglancer.apps.core.db") - def test_same_status_not_written(self, mock_db, mock_run): + def test_same_status_not_written(self, mock_db, mock_dispatch): settings = _make_settings() job = _make_db_job(1, "1001", "RUNNING") @@ -68,7 +68,7 @@ def test_same_status_not_written(self, mock_db, mock_run): mock_db.get_active_jobs.return_value = [job] # Worker returns RUNNING (lowercase from cluster_api) — same as DB - mock_run.return_value = { + mock_dispatch.return_value = { "jobs": { "1001": { "status": "running", @@ -80,13 +80,13 @@ def test_same_status_not_written(self, mock_db, mock_run): }, } - _poll_jobs(settings) + asyncio.run(_poll_jobs(settings)) mock_db.update_job_status.assert_not_called() - @patch("fileglancer.apps.core._run_as_user") + @patch("fileglancer.apps.core._dispatch", new_callable=AsyncMock) @patch("fileglancer.apps.core.db") - def test_changed_status_is_written(self, mock_db, mock_run): + def test_changed_status_is_written(self, mock_db, mock_dispatch): settings = _make_settings() job = _make_db_job(1, "1001", "RUNNING") @@ -96,7 +96,7 @@ def test_changed_status_is_written(self, mock_db, mock_run): mock_db.get_active_jobs.return_value = [job] # Worker returns DONE — different from DB's RUNNING - mock_run.return_value = { + mock_dispatch.return_value = { "jobs": { "1001": { "status": "done", @@ -108,15 +108,15 @@ def test_changed_status_is_written(self, mock_db, mock_run): }, } - _poll_jobs(settings) + asyncio.run(_poll_jobs(settings)) mock_db.update_job_status.assert_called_once() args, kwargs = mock_db.update_job_status.call_args assert args == (mock_session, 1, "DONE") - @patch("fileglancer.apps.core._run_as_user") + @patch("fileglancer.apps.core._dispatch", new_callable=AsyncMock) @patch("fileglancer.apps.core.db") - def test_job_statuses_passed_to_worker(self, mock_db, mock_run): + def test_job_statuses_passed_to_worker(self, mock_db, mock_dispatch): """The poll request must include job_statuses so the worker seeds stubs with the correct status instead of defaulting to PENDING.""" settings = _make_settings() @@ -130,12 +130,13 @@ def test_job_statuses_passed_to_worker(self, mock_db, mock_run): mock_db.get_db_session.return_value.__exit__ = MagicMock(return_value=False) mock_db.get_active_jobs.return_value = jobs - mock_run.return_value = {"jobs": {}} + mock_dispatch.return_value = {"jobs": {}} - _poll_jobs(settings) + asyncio.run(_poll_jobs(settings)) - request = mock_run.call_args[0][1] - assert request["job_statuses"] == {"1001": "RUNNING", "1002": "PENDING"} + # _dispatch is called as (username, action, **kwargs) + kwargs = mock_dispatch.call_args.kwargs + assert kwargs["job_statuses"] == {"1001": "RUNNING", "1002": "PENDING"} # --------------------------------------------------------------------------- @@ -365,10 +366,10 @@ def test_poll_jobs_routes_local_executor(self, tmp_path): mock_db.get_db_session.return_value.__exit__ = MagicMock(return_value=False) mock_db.get_active_jobs.return_value = [job] - result = _poll_jobs(settings) + result = asyncio.run(_poll_jobs(settings)) assert result is True - # Should NOT have called _run_as_user (cluster-based polling) + # Should NOT have dispatched a cluster poll mock_db.update_job_status.assert_called_once() finally: proc.terminate() diff --git a/tests/test_worker.py b/tests/test_worker.py index 2cc294dd..d0613c54 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -178,6 +178,7 @@ def _make_worker_pair(self): # Create a fake Popen-like object class FakeProcess: returncode = None + pid = 12345 def poll(self): return None def wait(self): pass def kill(self): pass @@ -273,6 +274,7 @@ def _make_worker_pair(self): class FakeProcess: returncode = None + pid = 12345 def poll(self): return None def wait(self): pass def kill(self): pass From 9c905f707bfe050a004521d2329556bd6323a057 Mon Sep 17 00:00:00 2001 From: Allison Truhlar Date: Mon, 11 May 2026 16:31:43 -0400 Subject: [PATCH 38/39] fix: re-export set_worker_exec from apps/__init__.py --- fileglancer/apps/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fileglancer/apps/__init__.py b/fileglancer/apps/__init__.py index 649f0749..b101295e 100644 --- a/fileglancer/apps/__init__.py +++ b/fileglancer/apps/__init__.py @@ -18,6 +18,7 @@ stop_job_monitor, submit_job, merge_requirements, + set_worker_exec, validate_path_for_shell, validate_path_in_filestore, verify_requirements, From 152ddbbceb30b820e2eabff18677121f386fec0f Mon Sep 17 00:00:00 2001 From: Allison Truhlar Date: Tue, 12 May 2026 09:16:01 -0400 Subject: [PATCH 39/39] chore: create alpha version for test release from branch --- frontend/package-lock.json | 4 ++-- frontend/package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 85cacddf..cad34def 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1,12 +1,12 @@ { "name": "fileglancer", - "version": "2.8.1", + "version": "2.8.2-a0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "fileglancer", - "version": "2.8.1", + "version": "2.8.2-a0", "license": "BSD-3-Clause", "dependencies": { "@material-tailwind/react": "^3.0.0-beta.24", diff --git a/frontend/package.json b/frontend/package.json index e40bd05a..0238a1c3 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,7 +1,7 @@ { "name": "fileglancer", "type": "module", - "version": "2.8.1", + "version": "2.8.2-a0", "description": "Browse, share, and publish files on the Janelia file system", "keywords": [ "ngff",