Skip to content

Commit d4d00af

Browse files
committed
Add URL parser module for detecting source type from URLs
- Parse GitHub URLs (owner/repo, tree/branch, commit/sha)
- Parse GitLab URLs (project path, subgroups, /-/tree/branch)
- Parse Bitbucket URLs (workspace/repo, src/branch, branch/name)
- Fallback to website source for unknown URLs
- Extract default index names from URLs
- Support self-hosted GitLab and Bitbucket instances
- Export parseSourceUrl and ParsedUrl from @augmentcode/context-connectors/core
- Add comprehensive unit tests (19 test cases)

Agent-Id: agent-8394bd07-7a81-41d0-ac95-1ca62623e6fb
1 parent 0805159 commit d4d00af

3 files changed

Lines changed: 337 additions & 0 deletions

File tree

src/core/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,6 @@ export { sanitizeKey, isoTimestamp } from "./utils.js";
2525
export { Indexer } from "./indexer.js";
2626
export type { IndexerConfig } from "./indexer.js";
2727

28+
export { parseSourceUrl } from "./url-parser.js";
29+
export type { ParsedUrl } from "./url-parser.js";
30+

src/core/url-parser.test.ts

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
import { describe, it, expect } from "vitest";
2+
import { parseSourceUrl } from "./url-parser.js";
3+
4+
describe("parseSourceUrl", () => {
5+
describe("GitHub URLs", () => {
  // Every valid case targets the same owner/repo; only the URL and the
  // ref extracted from it differ.
  const expectOwnerRepoAt = (sourceUrl: string, expectedRef: string): void => {
    const parsed = parseSourceUrl(sourceUrl);
    expect(parsed.type).toBe("github");
    expect(parsed.config).toEqual({ owner: "owner", repo: "repo", ref: expectedRef });
    expect(parsed.defaultIndexName).toBe("repo");
  };

  it("parses basic github.com URL", () => {
    expectOwnerRepoAt("https://github.com/owner/repo", "HEAD");
  });

  it("parses GitHub URL with tree/branch", () => {
    expectOwnerRepoAt("https://github.com/owner/repo/tree/main", "main");
  });

  it("parses GitHub URL with tree/feature/branch (slashes in branch name)", () => {
    expectOwnerRepoAt("https://github.com/owner/repo/tree/feature/branch", "feature/branch");
  });

  it("parses GitHub URL with commit SHA", () => {
    expectOwnerRepoAt("https://github.com/owner/repo/commit/abc123def456", "abc123def456");
  });

  it("throws on invalid GitHub URL without repo", () => {
    // A path holding only an owner cannot identify a repository.
    const parseOwnerOnly = (): unknown => parseSourceUrl("https://github.com/owner");
    expect(parseOwnerOnly).toThrow("Invalid GitHub URL");
  });
});
38+
39+
describe("GitLab URLs", () => {
  it("parses basic gitlab.com URL", () => {
    const expectedConfig = { projectId: "group/project", ref: "HEAD", baseUrl: undefined };
    const parsed = parseSourceUrl("https://gitlab.com/group/project");
    expect(parsed.type).toBe("gitlab");
    expect(parsed.config).toEqual(expectedConfig);
    expect(parsed.defaultIndexName).toBe("project");
  });

  it("parses GitLab URL with subgroups", () => {
    // The whole namespace path is the project id; only the last segment names the index.
    const expectedConfig = {
      projectId: "group/subgroup/project",
      ref: "HEAD",
      baseUrl: undefined,
    };
    const parsed = parseSourceUrl("https://gitlab.com/group/subgroup/project");
    expect(parsed.type).toBe("gitlab");
    expect(parsed.config).toEqual(expectedConfig);
    expect(parsed.defaultIndexName).toBe("project");
  });

  it("parses GitLab URL with /-/tree/branch", () => {
    const expectedConfig = { projectId: "group/project", ref: "main", baseUrl: undefined };
    const parsed = parseSourceUrl("https://gitlab.com/group/project/-/tree/main");
    expect(parsed.type).toBe("gitlab");
    expect(parsed.config).toEqual(expectedConfig);
    expect(parsed.defaultIndexName).toBe("project");
  });

  it("parses GitLab URL with /-/tree/feature/branch", () => {
    const expectedConfig = {
      projectId: "group/project",
      ref: "feature/branch",
      baseUrl: undefined,
    };
    const parsed = parseSourceUrl("https://gitlab.com/group/project/-/tree/feature/branch");
    expect(parsed.type).toBe("gitlab");
    expect(parsed.config).toEqual(expectedConfig);
  });

  it("parses self-hosted GitLab URL", () => {
    // A non-gitlab.com origin is reported back as baseUrl.
    const expectedConfig = {
      projectId: "team/project",
      ref: "HEAD",
      baseUrl: "https://gitlab.mycompany.com",
    };
    const parsed = parseSourceUrl("https://gitlab.mycompany.com/team/project");
    expect(parsed.type).toBe("gitlab");
    expect(parsed.config).toEqual(expectedConfig);
    expect(parsed.defaultIndexName).toBe("project");
  });

  it("throws on invalid GitLab URL", () => {
    const parseGroupOnly = (): unknown => parseSourceUrl("https://gitlab.com/group");
    expect(parseGroupOnly).toThrow("Invalid GitLab URL");
  });
});
90+
91+
describe("Bitbucket URLs", () => {
  // All cases use the same workspace/repo; ref and baseUrl are what vary.
  const configWith = (ref: string, baseUrl?: string) => ({
    workspace: "workspace",
    repo: "repo",
    ref,
    baseUrl,
  });

  it("parses basic bitbucket.org URL", () => {
    const parsed = parseSourceUrl("https://bitbucket.org/workspace/repo");
    expect(parsed.type).toBe("bitbucket");
    expect(parsed.config).toEqual(configWith("HEAD"));
    expect(parsed.defaultIndexName).toBe("repo");
  });

  it("parses Bitbucket URL with /src/branch", () => {
    const parsed = parseSourceUrl("https://bitbucket.org/workspace/repo/src/main");
    expect(parsed.type).toBe("bitbucket");
    expect(parsed.config).toEqual(configWith("main"));
  });

  it("parses Bitbucket URL with /branch/feature", () => {
    const parsed = parseSourceUrl("https://bitbucket.org/workspace/repo/branch/feature");
    expect(parsed.type).toBe("bitbucket");
    expect(parsed.config).toEqual(configWith("feature"));
  });

  it("parses self-hosted Bitbucket URL", () => {
    // A non-bitbucket.org origin is reported back as baseUrl.
    const parsed = parseSourceUrl("https://bitbucket.mycompany.com/workspace/repo");
    expect(parsed.type).toBe("bitbucket");
    expect(parsed.config).toEqual(configWith("HEAD", "https://bitbucket.mycompany.com"));
  });

  it("throws on invalid Bitbucket URL", () => {
    const parseWorkspaceOnly = (): unknown => parseSourceUrl("https://bitbucket.org/workspace");
    expect(parseWorkspaceOnly).toThrow("Invalid Bitbucket URL");
  });
});
141+
142+
describe("Website URLs (fallback)", () => {
  it("parses unknown URL as website", () => {
    // Hosts that match no known provider keep the full URL as their config.
    const { type, config, defaultIndexName } = parseSourceUrl("https://docs.example.com/api/v2");
    expect(type).toBe("website");
    expect(config).toEqual({ url: "https://docs.example.com/api/v2" });
    expect(defaultIndexName).toBe("docs.example.com");
  });

  it("uses hostname as default index name for website", () => {
    const { type, defaultIndexName } = parseSourceUrl("https://react.dev/learn/thinking-in-react");
    expect(type).toBe("website");
    expect(defaultIndexName).toBe("react.dev");
  });
});
156+
157+
describe("Invalid URLs", () => {
  it("throws on invalid URL format", () => {
    // The input has no scheme, so it is not a parseable absolute URL.
    const parseMalformed = (): unknown => parseSourceUrl("not-a-url");
    expect(parseMalformed).toThrow();
  });
});
162+
});
163+

src/core/url-parser.ts

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/**
2+
* URL Parser - Parses source URLs to determine type and extract configuration
3+
*
4+
* @module core/url-parser
5+
*/
6+
7+
import type { GitHubSourceConfig } from "../sources/github.js";
8+
import type { GitLabSourceConfig } from "../sources/gitlab.js";
9+
import type { BitBucketSourceConfig } from "../sources/bitbucket.js";
10+
import type { WebsiteSourceConfig } from "../sources/website.js";
11+
12+
/**
 * Outcome of classifying a source URL: which kind of source it points at,
 * the configuration extracted for that source, and a suggested index name.
 */
export interface ParsedUrl {
  /** Kind of source the URL was recognized as. */
  type: "github" | "gitlab" | "bitbucket" | "website";

  /** Source configuration extracted from the URL. */
  config:
    | GitHubSourceConfig
    | GitLabSourceConfig
    | BitBucketSourceConfig
    | WebsiteSourceConfig;

  /** Index name to suggest when the caller does not supply one. */
  defaultIndexName: string;
}
20+
21+
/**
 * Parse a source URL to determine the source type and extract configuration.
 *
 * Classification is by hostname:
 * - `github.com` (with or without a leading `www.`) is parsed as GitHub.
 * - Any hostname containing "gitlab" is parsed as GitLab, which covers
 *   gitlab.com and self-hosted instances such as gitlab.mycompany.com.
 * - Any hostname containing "bitbucket" is parsed as Bitbucket, likewise.
 * - Everything else falls back to a website source whose default index name
 *   is the hostname.
 *
 * The "contains" checks are a heuristic: an unrelated host whose name happens
 * to include "gitlab" or "bitbucket" is also routed to that parser.
 *
 * @param urlString - The URL to parse
 * @returns Parsed URL with type, config, and default index name
 * @throws TypeError if `urlString` is not a valid absolute URL
 * @throws Error if a GitHub/GitLab/Bitbucket URL lacks the required path segments
 *
 * @example
 * ```typescript
 * const result = parseSourceUrl("https://github.com/owner/repo/tree/main");
 * // result.type === "github"
 * // result.config === { owner: "owner", repo: "repo", ref: "main" }
 * // result.defaultIndexName === "repo"
 * ```
 */
export function parseSourceUrl(urlString: string): ParsedUrl {
  const url = new URL(urlString);
  const hostname = url.hostname.toLowerCase();

  // GitHub serves the same repositories on www.github.com, so accept both
  // spellings instead of letting the www form fall through to "website".
  if (hostname === "github.com" || hostname === "www.github.com") {
    return parseGitHubUrl(url);
  }

  // GitLab: gitlab.com itself or any self-hosted host containing "gitlab"
  if (hostname.includes("gitlab")) {
    return parseGitLabUrl(url);
  }

  // Bitbucket: bitbucket.org itself or any self-hosted host containing "bitbucket"
  if (hostname.includes("bitbucket")) {
    return parseBitBucketUrl(url);
  }

  // Fallback: treat the URL as a plain website, keeping the caller's string as-is
  return {
    type: "website",
    config: { url: urlString },
    defaultIndexName: hostname,
  };
}
62+
63+
/**
 * Parse a GitHub repository URL.
 *
 * Formats:
 * - https://github.com/owner/repo
 * - https://github.com/owner/repo.git (clone URL; the ".git" suffix is dropped)
 * - https://github.com/owner/repo/tree/branch
 * - https://github.com/owner/repo/tree/feature/branch
 * - https://github.com/owner/repo/commit/sha
 *
 * Everything after `tree/` or `commit/` is taken as the ref, because the URL
 * alone cannot tell a branch name containing slashes from a branch followed
 * by a sub-path.
 *
 * @param url - Already-parsed URL whose path identifies the repository
 * @returns Result of type "github" with `{ owner, repo, ref }` config; `ref`
 *   is "HEAD" when the URL names no branch or commit, and the default index
 *   name is the repository name
 * @throws Error if the path does not contain both an owner and a repo
 */
function parseGitHubUrl(url: URL): ParsedUrl {
  const [owner, repoSegment, marker, ...refParts] = url.pathname.split("/").filter(Boolean);

  // Clone URLs end in ".git"; the suffix is not part of the repository name.
  const repo = repoSegment?.replace(/\.git$/, "");

  if (!owner || !repo) {
    throw new Error(`Invalid GitHub URL: ${url.href} - expected owner and repo in path`);
  }

  // Join all remaining parts to handle branch names with slashes.
  const hasRef = (marker === "tree" || marker === "commit") && refParts.length > 0;
  const ref = hasRef ? refParts.join("/") : "HEAD";

  return {
    type: "github",
    config: { owner, repo, ref },
    defaultIndexName: repo,
  };
}
96+
97+
/**
 * Parse a GitLab project URL (gitlab.com or a self-hosted instance).
 *
 * Formats:
 * - https://gitlab.com/group/project
 * - https://gitlab.com/group/project.git (clone URL; the ".git" suffix is dropped)
 * - https://gitlab.com/group/subgroup/project
 * - https://gitlab.com/group/project/-/tree/branch
 * - https://gitlab.com/group/project/-/commits/branch
 * - https://gitlab.com/group/project/-/commit/sha
 *
 * The project path is everything before the `-` separator (or the whole path
 * when there is none). Everything after `tree/`, `commits/` or `commit/` is
 * taken as the ref, so branch names containing slashes are preserved.
 *
 * @param url - Already-parsed URL whose path identifies the project
 * @returns Result of type "gitlab" with `{ projectId, ref, baseUrl }` config.
 *   `ref` is "HEAD" when the URL names no ref; `baseUrl` is the URL origin
 *   unless that origin is https://gitlab.com, in which case it is undefined;
 *   the default index name is the last segment of the project path
 * @throws Error if the project path has fewer than two segments
 */
function parseGitLabUrl(url: URL): ParsedUrl {
  const pathParts = url.pathname.split("/").filter(Boolean);

  // GitLab separates the project path from its routes with a lone "-" segment.
  const dashIndex = pathParts.indexOf("-");
  const projectParts = dashIndex === -1 ? [...pathParts] : pathParts.slice(0, dashIndex);

  // Clone URLs end in ".git"; the suffix is not part of the project path.
  const lastIndex = projectParts.length - 1;
  const lastSegment = projectParts[lastIndex];
  if (lastSegment !== undefined) {
    projectParts[lastIndex] = lastSegment.replace(/\.git$/, "");
  }
  const projectName = projectParts[lastIndex];

  // Validate the project path itself, not the raw path: "/group/-/tree/main"
  // has four segments but names no project.
  if (projectParts.length < 2 || !projectName) {
    throw new Error(`Invalid GitLab URL: ${url.href} - expected project path`);
  }

  let ref = "HEAD";
  if (dashIndex !== -1) {
    const marker = pathParts[dashIndex + 1];
    const refParts = pathParts.slice(dashIndex + 2);
    const isRefRoute = marker === "tree" || marker === "commits" || marker === "commit";
    // Keep the "HEAD" default when the route is present but names no ref.
    if (isRefRoute && refParts.length > 0) {
      ref = refParts.join("/");
    }
  }

  const projectId = projectParts.join("/");

  // Handle self-hosted GitLab
  const baseUrl = url.origin !== "https://gitlab.com" ? url.origin : undefined;

  return {
    type: "gitlab",
    config: { projectId, ref, baseUrl },
    defaultIndexName: projectName,
  };
}
136+
137+
/**
 * Parse a Bitbucket repository URL (bitbucket.org or a self-hosted instance).
 *
 * Formats:
 * - https://bitbucket.org/workspace/repo
 * - https://bitbucket.org/workspace/repo.git (clone URL; the ".git" suffix is dropped)
 * - https://bitbucket.org/workspace/repo/src/branch
 * - https://bitbucket.org/workspace/repo/branch/feature
 *
 * Everything after `src/` or `branch/` is taken as the ref, because the URL
 * alone cannot tell a branch name containing slashes from a branch followed
 * by a sub-path.
 *
 * @param url - Already-parsed URL whose path identifies the repository
 * @returns Result of type "bitbucket" with `{ workspace, repo, ref, baseUrl }`
 *   config. `ref` is "HEAD" when the URL names no ref; `baseUrl` is the URL
 *   origin unless that origin is https://bitbucket.org, in which case it is
 *   undefined; the default index name is the repository name
 * @throws Error if the path does not contain both a workspace and a repo
 */
function parseBitBucketUrl(url: URL): ParsedUrl {
  const [workspace, repoSegment, marker, ...refParts] = url.pathname.split("/").filter(Boolean);

  // Clone URLs end in ".git"; the suffix is not part of the repository name.
  const repo = repoSegment?.replace(/\.git$/, "");

  if (!workspace || !repo) {
    throw new Error(`Invalid Bitbucket URL: ${url.href} - expected workspace and repo in path`);
  }

  // Check for /src/branch or /branch/name patterns
  const hasRef = (marker === "src" || marker === "branch") && refParts.length > 0;
  const ref = hasRef ? refParts.join("/") : "HEAD";

  // Handle self-hosted Bitbucket
  const baseUrl = url.origin !== "https://bitbucket.org" ? url.origin : undefined;

  return {
    type: "bitbucket",
    config: { workspace, repo, ref, baseUrl },
    defaultIndexName: repo,
  };
}
171+

0 commit comments

Comments
 (0)