From e2b5405d3855e3b74c08be7636e9b1c27c95fedd Mon Sep 17 00:00:00 2001 From: KillT Date: Thu, 14 May 2026 18:10:51 +0700 Subject: [PATCH] feat(katana): add katana crawler component --- .husky/prepare-commit-msg | 6 - worker/src/components/index.ts | 1 + .../__tests__/katana-integration.test.ts | 74 +++ .../security/__tests__/katana.test.ts | 120 +++++ worker/src/components/security/katana.ts | 444 ++++++++++++++++++ 5 files changed, 639 insertions(+), 6 deletions(-) delete mode 100755 .husky/prepare-commit-msg create mode 100644 worker/src/components/security/__tests__/katana-integration.test.ts create mode 100644 worker/src/components/security/__tests__/katana.test.ts create mode 100644 worker/src/components/security/katana.ts diff --git a/.husky/prepare-commit-msg b/.husky/prepare-commit-msg deleted file mode 100755 index c699f2f3..00000000 --- a/.husky/prepare-commit-msg +++ /dev/null @@ -1,6 +0,0 @@ -# Automatically add Signed-off-by line for the current committer if not already present -# This ensures DCO compliance even when cherry-picking/rebasing commits with existing signoffs -COMMITTER_SIGNOFF="Signed-off-by: $(git config user.name) <$(git config user.email)>" -if ! grep -qF "$COMMITTER_SIGNOFF" "$1"; then - git interpret-trailers --in-place --trailer "$COMMITTER_SIGNOFF" "$1" -fi diff --git a/worker/src/components/index.ts b/worker/src/components/index.ts index 714c92e2..b75b843b 100644 --- a/worker/src/components/index.ts +++ b/worker/src/components/index.ts @@ -55,6 +55,7 @@ import './security/trufflehog'; import './security/terminal-demo'; import './security/virustotal'; import './security/abuseipdb'; +import './security/katana'; import './security/aws-mcp-group'; // GitHub components diff --git a/worker/src/components/security/__tests__/katana-integration.test.ts b/worker/src/components/security/__tests__/katana-integration.test.ts new file mode 100644 index 00000000..6a5bf964 --- /dev/null +++ b/worker/src/components/security/__tests__/katana-integration.test.ts @@ -0,0 +1,74 @@ +/** + * Integration test for Katana component with real Docker execution + * Requires Docker daemon to be running + */ +import { describe, test, expect, beforeEach } from 'bun:test'; +import { componentRegistry, createExecutionContext } from '@shipsec/component-sdk'; +import type { ExecutionContext } from '@shipsec/component-sdk'; +import type { KatanaInput, KatanaOutput } from '../katana'; +import '../katana'; // Register the component + +const enableDockerIntegration = process.env.ENABLE_DOCKER_TESTS === 'true'; +const dockerAvailable = (() => { + try { + const result = Bun.spawnSync(['docker', 'version']); + return result.exitCode === 0; + } catch { + return false; + } +})(); + +const shouldRunDockerTests = enableDockerIntegration && dockerAvailable; +if (!shouldRunDockerTests) { + console.warn( + 'Skipping katana integration tests. Ensure ENABLE_DOCKER_TESTS=true and Docker is available to enable.', + ); +} + +const dockerDescribe = shouldRunDockerTests ? describe : describe.skip; + +dockerDescribe('Katana Integration (Docker)', () => { + let context: ExecutionContext; + const logs: string[] = []; + + beforeEach(() => { + logs.length = 0; + context = createExecutionContext({ + runId: 'test-run', + componentRef: 'shipsec.katana.crawl', + logCollector: (entry) => { + logs.push(`${entry.stream.toUpperCase()}: ${entry.message}`); + }, + }); + }); + + test('should crawl urls for a known domain using real katana', async () => { + const component = componentRegistry.get('shipsec.katana.crawl')!; + expect(component).toBeDefined(); + + const result = await component.execute( + { + inputs: { urls: ['https://example.com'] }, + params: { depth: 1 }, + }, + context, + ); + + console.log('Katana result:', result); + + // Verify output structure + expect(result).toHaveProperty('crawledUrls'); + expect(result).toHaveProperty('rawOutput'); + expect(result).toHaveProperty('targetCount'); + expect(result).toHaveProperty('urlCount'); + expect(Array.isArray(result.crawledUrls)).toBe(true); + expect(typeof result.rawOutput).toBe('string'); + expect(typeof result.targetCount).toBe('number'); + expect(typeof result.urlCount).toBe('number'); + + expect(result.targetCount).toBe(1); + + // Check logs + expect(logs.some((log) => log.includes('katana'))).toBe(true); + }, 120000); +}); diff --git a/worker/src/components/security/__tests__/katana.test.ts b/worker/src/components/security/__tests__/katana.test.ts new file mode 100644 index 00000000..6ab160f2 --- /dev/null +++ b/worker/src/components/security/__tests__/katana.test.ts @@ -0,0 +1,120 @@ +import { describe, it, expect, beforeAll, afterEach, vi, mock } from 'bun:test'; +import * as sdk from '@shipsec/component-sdk'; +import type { KatanaInput, KatanaOutput } from '../katana'; + +mock.module('../../../utils/isolated-volume', () => ({ + IsolatedContainerVolume: class { + async initialize() { + return 'mock-volume'; + } + getVolumeConfig(containerPath = '/inputs', readOnly = true) { + return { source: 'mock-volume', target: containerPath, readOnly }; + } + async cleanup() {} + }, +})); + +let componentRegistry: typeof import('@shipsec/component-sdk').componentRegistry; + +describe('katana component', () => { + beforeAll(async () => { + ({ componentRegistry } = await import('../../index')); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('should be registered', () => { + const component = componentRegistry.get('shipsec.katana.crawl'); + expect(component).toBeDefined(); + expect(component!.label).toBe('Katana Web Crawler'); + expect(component!.category).toBe('security'); + }); + + it('should normalise raw output returned as plain text', async () => { + const component = componentRegistry.get('shipsec.katana.crawl'); + if (!component) throw new Error('Component not registered'); + + const context = sdk.createExecutionContext({ + runId: 'test-run', + componentRef: 'katana-test', + }); + + const executePayload = { + inputs: { + urls: ['https://example.com'], + }, + params: {}, + }; + + vi.spyOn(sdk, 'runComponentWithRunner').mockResolvedValue( + 'https://example.com/login\nhttps://example.com/dashboard', + ); + + const result = component.outputs.parse(await component.execute(executePayload, context)); + + expect(result.crawledUrls).toEqual([ + 'https://example.com/login', + 'https://example.com/dashboard', + ]); + expect(result.rawOutput).toBe('https://example.com/login\nhttps://example.com/dashboard'); + expect(result.targetCount).toBe(1); + expect(result.urlCount).toBe(2); + }); + + it('should accept legacy "urls" string array and normalise', () => { + const component = componentRegistry.get('shipsec.katana.crawl'); + if (!component) throw new Error('Component not registered'); + + const inputs = component.inputs.parse({ urls: 'https://example.com' }); + expect(inputs.urls).toEqual(['https://example.com']); + }); + + it('should pass JS crawl and Headless flags when configured', async () => { + const component = componentRegistry.get('shipsec.katana.crawl'); + if (!component) throw new Error('Component not registered'); + + const context = sdk.createExecutionContext({ + runId: 'test-run', + componentRef: 'katana-test', + }); + + const executePayload = { + inputs: { + urls: ['https://example.com'], + }, + params: { + jsCrawl: true, + headless: true, + depth: 2, + }, + }; + + const runnerSpy = vi.spyOn(sdk, 'runComponentWithRunner').mockResolvedValue(''); + + await component.execute(executePayload, context); + + expect(runnerSpy).toHaveBeenCalled(); + const [runnerConfig] = runnerSpy.mock.calls[0]; + expect(runnerConfig).toBeDefined(); + if (runnerConfig && runnerConfig.kind === 'docker') { + const command = runnerConfig.command ?? []; + expect(command).toContain('-jc'); + expect(command).toContain('-hl'); + expect(command).toContain('-d'); + const dIndex = command.indexOf('-d'); + expect(command[dIndex + 1]).toBe('2'); + } + }); + + it('should use docker runner config', () => { + const component = componentRegistry.get('shipsec.katana.crawl'); + if (!component) throw new Error('Component not registered'); + + expect(component.runner.kind).toBe('docker'); + if (component.runner.kind === 'docker') { + expect(component.runner.image).toBe('ghcr.io/shipsecai/katana:latest'); + } + }); +}); diff --git a/worker/src/components/security/katana.ts b/worker/src/components/security/katana.ts new file mode 100644 index 00000000..4fb10934 --- /dev/null +++ b/worker/src/components/security/katana.ts @@ -0,0 +1,444 @@ +import { z } from 'zod'; +import { + componentRegistry, + runComponentWithRunner, + type DockerRunnerConfig, + ContainerError, + ComponentRetryPolicy, + defineComponent, + inputs, + outputs, + parameters, + port, + param, + generateFindingHash, + analyticsResultSchema, + type AnalyticsResult, +} from '@shipsec/component-sdk'; +import { IsolatedContainerVolume } from '../../utils/isolated-volume'; + +const KATANA_IMAGE = 'ghcr.io/shipsecai/katana:latest'; +const KATANA_TIMEOUT_SECONDS = 1800; // 30 minutes +const INPUT_MOUNT_NAME = 'inputs'; +const CONTAINER_INPUT_DIR = `/${INPUT_MOUNT_NAME}`; +const TARGET_FILE_NAME = 'targets.txt'; +const HEADERS_FILE_NAME = 'headers.txt'; + +const targetValueSchema = z.preprocess( + (val) => (typeof val === 'string' ? [val] : val), + z.array(z.string().min(1)), +); + +const inputSchema = inputs({ + urls: port(targetValueSchema.describe('Array of target URLs'), { + label: 'Target URLs', + description: 'Array of URLs to crawl.', + connectionType: { kind: 'list', element: { kind: 'primitive', name: 'text' } }, + }), + headers: port(z.array(z.string()).optional().describe('Custom headers to send with requests'), { + label: 'Custom Headers', + description: 'Array of custom headers (e.g. "Authorization: Bearer token").', + connectionType: { kind: 'list', element: { kind: 'primitive', name: 'text' } }, + }), +}); + +const parameterSchema = parameters({ + depth: param(z.number().int().min(1).max(10).default(3), { + label: 'Crawl Depth', + editor: 'number', + min: 1, + max: 10, + description: 'Maximum depth to crawl.', + }), + jsCrawl: param(z.boolean().default(false), { + label: 'JS Crawl', + editor: 'boolean', + description: 'Enable JS parsing and crawling.', + }), + headless: param(z.boolean().default(false), { + label: 'Headless Mode', + editor: 'boolean', + description: 'Enable headless browser for crawling.', + }), + threads: param(z.number().int().min(1).max(100).default(10), { + label: 'Threads', + editor: 'number', + min: 1, + max: 100, + description: 'Number of concurrent threads for crawling.', + }), + timeout: param(z.number().int().min(1).max(300).default(10), { + label: 'Timeout (seconds)', + editor: 'number', + min: 1, + max: 300, + description: 'Timeout for requests in seconds.', + }), + customFlags: param( + z.string().trim().optional().describe('Raw CLI flags to append to the katana command'), + { + label: 'Custom CLI Flags', + editor: 'textarea', + rows: 3, + placeholder: '-jc -hl', + description: 'Paste additional katana CLI options exactly as you would on the command line.', + helpText: 'Flags are appended after the generated options.', + }, + ), +}); + +const outputSchema = outputs({ + crawledUrls: port(z.array(z.string()), { + label: 'Crawled URLs', + description: 'Array of all URLs discovered during crawling.', + }), + rawOutput: port(z.string(), { + label: 'Raw Output', + description: 'Raw tool output for debugging.', + }), + targetCount: port(z.number(), { + label: 'Target Count', + description: 'Number of targets scanned.', + }), + urlCount: port(z.number(), { + label: 'URL Count', + description: 'Number of URLs discovered.', + }), + results: port(z.array(analyticsResultSchema()), { + label: 'Results', + description: + 'Analytics-ready findings with scanner, finding_hash, and severity. Connect to Analytics Sink.', + }), +}); + +// Split custom CLI flags into an array of arguments +const splitCliArgs = (input: string): string[] => { + const args: string[] = []; + let current = ''; + let quote: '"' | "'" | null = null; + let escape = false; + + for (const ch of input) { + if (escape) { + current += ch; + escape = false; + continue; + } + + if (ch === '\\') { + escape = true; + continue; + } + + if (quote) { + if (ch === quote) { + quote = null; + } else { + current += ch; + } + continue; + } + + if (ch === '"' || ch === "'") { + quote = ch as '"' | "'"; + continue; + } + + if (/\s/.test(ch)) { + if (current.length > 0) { + args.push(current); + current = ''; + } + continue; + } + + current += ch; + } + + if (current.length > 0) { + args.push(current); + } + + return args; +}; + +interface BuildKatanaArgsOptions { + targetFile: string; + headersFile?: string; + depth?: number; + jsCrawl: boolean; + headless: boolean; + threads?: number; + timeout?: number; + customFlags: string[]; +} + +const buildKatanaArgs = (options: BuildKatanaArgsOptions): string[] => { + const args: string[] = []; + + // Always use silent mode for clean output + args.push('-silent'); + + // Target list file input + args.push('-list', options.targetFile); + + // Headers file + if (options.headersFile) { + args.push('-H', options.headersFile); + } + + // Depth + if (typeof options.depth === 'number' && options.depth >= 1) { + args.push('-d', String(options.depth)); + } + + // Thread count + if (typeof options.threads === 'number' && options.threads >= 1) { + args.push('-c', String(options.threads)); + } + + // Timeout + if (typeof options.timeout === 'number' && options.timeout >= 1) { + args.push('-timeout', String(options.timeout)); + } + + // JS Crawl + if (options.jsCrawl) { + args.push('-jc'); + } + + // Headless mode + if (options.headless) { + args.push('-hl'); + } + + // Custom flags + for (const flag of options.customFlags) { + if (flag.length > 0) { + args.push(flag); + } + } + + return args; +}; + +const katanaRetryPolicy: ComponentRetryPolicy = { + maxAttempts: 2, + initialIntervalSeconds: 5, + maximumIntervalSeconds: 30, + backoffCoefficient: 2.0, + nonRetryableErrorTypes: ['ContainerError', 'ValidationError', 'ConfigurationError'], +}; + +const definition = defineComponent({ + id: 'shipsec.katana.crawl', + label: 'Katana Web Crawler', + category: 'security', + retryPolicy: katanaRetryPolicy, + runner: { + kind: 'docker', + image: KATANA_IMAGE, + network: 'bridge', + timeoutSeconds: KATANA_TIMEOUT_SECONDS, + env: { + HOME: '/tmp', + }, + command: [], + }, + inputs: inputSchema, + outputs: outputSchema, + parameters: parameterSchema, + docs: 'Runs ProjectDiscovery katana to crawl targets and discover URLs.', + ui: { + slug: 'katana', + version: '1.0.0', + type: 'scan', + category: 'security', + description: 'A next-generation crawling and spidering framework.', + documentation: 'ProjectDiscovery Katana documentation details configuration and usage.', + documentationUrl: 'https://github.com/projectdiscovery/katana', + icon: 'Spider', + author: { + name: 'ShipSecAI', + type: 'shipsecai', + }, + isLatest: true, + deprecated: false, + example: '`katana -list targets.txt -silent`', + examples: [ + 'Crawl discovered subdomains to find endpoints.', + 'Use headless mode for SPA crawling.', + ], + }, + toolProvider: { + kind: 'component', + name: 'web_crawler', + description: 'Web crawling and spidering framework (Katana).', + }, + async execute({ inputs, params }, context) { + const parsedParams = parameterSchema.parse(params); + const { depth, jsCrawl, headless, threads, timeout, customFlags } = parsedParams; + + const trimmedCustomFlags = + typeof customFlags === 'string' && customFlags.length > 0 ? customFlags : null; + const customFlagArgs = trimmedCustomFlags ? splitCliArgs(trimmedCustomFlags) : []; + + const values = new Set(); + const addValue = (value: string | string[] | undefined) => { + if (Array.isArray(value)) { + value.forEach((item) => { + const trimmed = item.trim(); + if (trimmed.length > 0) { + values.add(trimmed); + } + }); + return; + } + if (typeof value === 'string') { + const trimmed = value.trim(); + if (trimmed.length > 0) { + values.add(trimmed); + } + } + }; + + addValue(inputs.urls); + const targets = Array.from(values); + const targetCount = targets.length; + + if (targetCount === 0) { + context.logger.info('[Katana] Skipping execution because no targets were provided.'); + return { + crawledUrls: [], + results: [], + rawOutput: '', + targetCount: 0, + urlCount: 0, + }; + } + + context.logger.info(`[Katana] Crawling ${targetCount} target(s)`); + context.emitProgress({ + message: `Launching Katana for ${targetCount} target(s)`, + level: 'info', + data: { targets }, + }); + + const tenantId = (context as any).tenantId ?? 'default-tenant'; + const volume = new IsolatedContainerVolume(tenantId, context.runId); + + const baseRunner = definition.runner; + if (baseRunner.kind !== 'docker') { + throw new ContainerError('Katana runner is expected to be docker-based.', { + details: { expectedKind: 'docker', actualKind: baseRunner.kind }, + }); + } + + let rawOutput: string; + try { + const inputFiles: Record = { + [TARGET_FILE_NAME]: targets.join('\n'), + }; + + if (inputs.headers && inputs.headers.length > 0) { + inputFiles[HEADERS_FILE_NAME] = inputs.headers.join('\n'); + } + + await volume.initialize(inputFiles); + + const katanaArgs = buildKatanaArgs({ + targetFile: `${CONTAINER_INPUT_DIR}/${TARGET_FILE_NAME}`, + headersFile: + inputs.headers && inputs.headers.length > 0 + ? `${CONTAINER_INPUT_DIR}/${HEADERS_FILE_NAME}` + : undefined, + depth: depth ?? 3, + jsCrawl: jsCrawl ?? false, + headless: headless ?? false, + threads: threads ?? 10, + timeout: timeout ?? 10, + customFlags: customFlagArgs, + }); + + const runnerConfig: DockerRunnerConfig = { + kind: 'docker', + image: baseRunner.image, + network: baseRunner.network, + timeoutSeconds: baseRunner.timeoutSeconds ?? KATANA_TIMEOUT_SECONDS, + env: { ...(baseRunner.env ?? {}) }, + command: [...(baseRunner.command ?? []), ...katanaArgs], + volumes: [volume.getVolumeConfig(CONTAINER_INPUT_DIR, true)], + }; + + try { + const result = await runComponentWithRunner( + runnerConfig, + async () => ({}) as Output, + { targets }, + context, + ); + + if (typeof result === 'string') { + rawOutput = result; + } else if (result && typeof result === 'object' && 'rawOutput' in result) { + rawOutput = String((result as any).rawOutput ?? ''); + } else { + rawOutput = ''; + } + } catch (error) { + if (error instanceof ContainerError) { + const details = (error as any).details as Record | undefined; + const capturedStdout = details?.stdout; + if (typeof capturedStdout === 'string' && capturedStdout.trim().length > 0) { + context.logger.warn( + `[Katana] Container exited non-zero but produced output. Preserving partial results.`, + ); + rawOutput = capturedStdout; + } else { + throw error; + } + } else { + throw error; + } + } + } finally { + await volume.cleanup(); + } + + const lines = rawOutput + .trim() + .split(/\r?\n/) + .map((line) => line.trim()) + .filter((line) => line.length > 0); + + const crawledUrlsSet = new Set(lines); + const crawledUrls = Array.from(crawledUrlsSet); + const urlCount = crawledUrls.length; + + context.logger.info(`[Katana] Discovered ${urlCount} URLs across ${targetCount} target(s)`); + + const analyticsResults: AnalyticsResult[] = crawledUrls.map((url) => ({ + scanner: 'katana', + finding_hash: generateFindingHash('crawled-url', url, targets.join(',')), + severity: 'info' as const, + asset_key: url, + url, + parent_targets: targets, + })); + + return { + crawledUrls, + rawOutput, + targetCount, + urlCount, + results: analyticsResults, + }; + }, +}); + +componentRegistry.register(definition); + +type Output = (typeof outputSchema)['__inferred']; + +export type KatanaInput = typeof inputSchema; +export type KatanaOutput = typeof outputSchema;