Skip to content

Commit b6a9db6

Browse files
committed
Use 4-tier system to store package metadata
- The four tiers are super large, large, medium, and small, which correspond to 1024, 128, 32, and 1 repositories, respectively. Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent bcf02ac commit b6a9db6

3 files changed

Lines changed: 42 additions & 14 deletions

File tree

aboutcode/hashid/__init__.py

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,12 @@ def get_package_base_dir(purl: Union[PackageURL, str]):
140140
"""
141141
Return the base path to a Package directory (ignoring version) for a purl
142142
"""
143+
if isinstance(purl, str):
144+
purl = PackageURL.from_string(purl)
145+
143146
path_elements = package_path_elements(purl)
144147
phash, core_path, _pversion, _extra_path = path_elements
145-
return Path(f"{PACKAGE_REPOS_NAME_PREFIX}-{phash}") / core_path
148+
return Path(f"{PACKAGE_REPOS_NAME_PREFIX}-{purl.type}-{phash}") / core_path
146149

147150

148151
def get_package_purls_yml_file_path(purl: Union[PackageURL, str]):
@@ -208,15 +211,15 @@ def package_path_elements(purl: Union[PackageURL, str]):
208211
We keep the same prefix for different versions::
209212
210213
>>> package_path_elements("pkg:pypi/license_expression@30.3.1")
211-
('1050', 'pypi/license-expression', '30.3.1', '')
214+
('50', 'pypi/license-expression', '30.3.1', '')
212215
>>> package_path_elements("pkg:pypi/license_expression@10.3.1")
213-
('1050', 'pypi/license-expression', '10.3.1', '')
216+
('50', 'pypi/license-expression', '10.3.1', '')
214217
215218
We encode with quotes, avoid double encoding of already quoted parts to make subpaths easier
216219
for filesystems::
217220
218221
>>> package_path_elements("pkg:pypi/license_expression@30.3.1?foo=bar&baz=bar#sub/path")
219-
('1050', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')
222+
('50', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')
220223
221224
>>> purl = PackageURL(
222225
... type="pypi",
@@ -225,7 +228,7 @@ def package_path_elements(purl: Union[PackageURL, str]):
225228
... qualifiers=dict(foo="bar"),
226229
... subpath="a/b/c")
227230
>>> package_path_elements(purl)
228-
('1050', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
231+
('50', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
229232
"""
230233
if isinstance(purl, str):
231234
purl = PackageURL.from_string(purl)
@@ -287,7 +290,27 @@ def get_core_purl(purl: Union[PackageURL, str]):
287290
return PackageURL(**purld)
288291

289292

290-
def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 13) -> str:
293+
# See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
#
# Mapping of PURL type -> number of hash bits used for that ecosystem. Per the
# tier comments below, a type with N bits is spread over 2**N repositories.
# Keys are matched against ``PackageURL.type``, so they must be PURL type
# names ("cpan", "gem", "composer"), not language names.
BIT_COUNT_BY_ECOSYSTEM = {
    # Super large ecosystem: 1024 repos.
    "npm": 10,
    # Large ecosystem: 128 repos.
    "pypi": 7,
    "maven": 7,
    "golang": 7,
    "cpan": 7,
    "gem": 7,
    "nuget": 7,
    "composer": 7,
    # Language-name aliases kept for backward compatibility; these are not
    # PURL types and are not expected to match a parsed PURL.
    "perl": 7,
    "ruby": 7,
    "php": 7,
    # Medium ecosystem: 32 repos.
    "rpm": 5,
    "deb": 5,
    # Small ecosystem: 1 repo.
    "github": 0,
}
311+
312+
313+
def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 0) -> str:
291314
"""
292315
Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
293316
and we drop its version, qualifiers and subpath.
@@ -320,30 +343,35 @@ def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 13) -> str:
320343
321344
The hash does not change with version or qualifiers::
322345
>>> get_purl_hash("pkg:pypi/univers@30.12.0")
323-
'1289'
346+
'09'
324347
>>> get_purl_hash("pkg:pypi/univers@10.12.0")
325-
'1289'
348+
'09'
326349
>>> get_purl_hash("pkg:pypi/univers@30.12.0?foo=bar#sub/path")
327-
'1289'
350+
'09'
328351
329352
The hash is left-padded with zeros if it is too short::
330353
>>> get_purl_hash("pkg:pypi/expressionss")
331-
'0057'
354+
'57'
332355
333356
We normalize the PURL. Here pypi normalization always uses dash for underscore ::
334357
335358
>>> get_purl_hash("pkg:pypi/license_expression")
336-
'1050'
359+
'50'
337360
>>> get_purl_hash("pkg:pypi/license-expression")
338-
'1050'
361+
'50'
339362
340363
Originally from:
341364
https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
342365
"""
343366

344-
core_purl = get_core_purl(purl).to_string()
367+
core_purl = get_core_purl(purl)
368+
369+
if core_purl.type in BIT_COUNT_BY_ECOSYSTEM:
370+
_bit_count = BIT_COUNT_BY_ECOSYSTEM[core_purl.type]
371+
372+
core_purl_str = core_purl.to_string()
345373
# compute the hash from a UTF-8 encoded string
346-
purl_bytes = core_purl.encode("utf-8")
374+
purl_bytes = core_purl_str.encode("utf-8")
347375
hash_bytes = sha256(purl_bytes).digest()
348376
# ... converted to integer so we can truncate with modulo. Note that we use big endian.
349377
hash_int = int.from_bytes(hash_bytes, "big")

vulnerabilities/tests/test_data/export_command/aboutcode-packages-1ccd/generic/nginx/test/purls.yml renamed to vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0/generic/nginx/test/purls.yml

File renamed without changes.

vulnerabilities/tests/test_data/export_command/aboutcode-packages-1ccd/generic/nginx/test/vulnerabilities.yml renamed to vulnerabilities/tests/test_data/export_command/aboutcode-packages-generic-0/generic/nginx/test/vulnerabilities.yml

File renamed without changes.

0 commit comments

Comments
 (0)