def get_package_base_dir(purl: Union[PackageURL, str]):
    """
    Return the base path to a Package directory (ignoring version) for a purl.

    ``purl`` is either a PackageURL object or a PURL string; a string is parsed
    with ``PackageURL.from_string`` first because the PURL type is needed below.

    The returned relative ``Path`` has two parts:

    - a repository directory named from ``PACKAGE_REPOS_NAME_PREFIX``, the PURL
      type and the PURL hash, joined with dashes;
    - the "core path" of the package as computed by ``package_path_elements``
      (for instance ``pypi/license-expression``).
    """
    if isinstance(purl, str):
        purl = PackageURL.from_string(purl)

    # The version and extra path elements are deliberately ignored: the base
    # directory is shared by all the versions of a package.
    phash, core_path, _pversion, _extra_path = package_path_elements(purl)

    # No whitespace inside the name: it is used verbatim as a directory name.
    repo_dir_name = f"{PACKAGE_REPOS_NAME_PREFIX}-{purl.type}-{phash}"
    return Path(repo_dir_name) / core_path
146149
147150
148151def get_package_purls_yml_file_path (purl : Union [PackageURL , str ]):
@@ -208,15 +211,15 @@ def package_path_elements(purl: Union[PackageURL, str]):
208211 We keep the same prefix for different versions::
209212
210213 >>> package_path_elements("pkg:pypi/license_expression@30.3.1")
211- ('1050 ', 'pypi/license-expression', '30.3.1', '')
214+ ('50 ', 'pypi/license-expression', '30.3.1', '')
212215 >>> package_path_elements("pkg:pypi/license_expression@10.3.1")
213- ('1050 ', 'pypi/license-expression', '10.3.1', '')
216+ ('50 ', 'pypi/license-expression', '10.3.1', '')
214217
215218 We encode with quotes, avoid double encoding of already quoted parts to make subpaths easier
216219 for filesystems::
217220
218221 >>> package_path_elements("pkg:pypi/license_expression@30.3.1?foo=bar&baz=bar#sub/path")
219- ('1050 ', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')
222+ ('50 ', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath')
220223
221224 >>> purl = PackageURL(
222225 ... type="pypi",
@@ -225,7 +228,7 @@ def package_path_elements(purl: Union[PackageURL, str]):
225228 ... qualifiers=dict(foo="bar"),
226229 ... subpath="a/b/c")
227230 >>> package_path_elements(purl)
228- ('1050 ', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
231+ ('50 ', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc')
229232 """
230233 if isinstance (purl , str ):
231234 purl = PackageURL .from_string (purl )
@@ -287,7 +290,27 @@ def get_core_purl(purl: Union[PackageURL, str]):
287290 return PackageURL (** purld )
288291
289292
290- def get_purl_hash (purl : Union [PackageURL , str ], _bit_count : int = 13 ) -> str :
# Number of hash bits used for each ecosystem, keyed by PURL type. An ecosystem
# using N bits has its packages spread over 2**N repositories.
# See https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726
BIT_COUNT_BY_ECOSYSTEM = {
    # Super large ecosystem 1024 repos.
    "npm": 10,
    # Large ecosystem 128 repos.
    "pypi": 7,
    "maven": 7,
    "golang": 7,
    "nuget": 7,
    # Keys must be PURL types, since lookups are done on the "type" of a PURL:
    # Perl packages use "cpan", Ruby packages use "gem" and PHP packages use
    # "composer".
    "cpan": 7,
    "gem": 7,
    "composer": 7,
    # Non-standard aliases of the three types above, kept for backward
    # compatibility with the names originally used in this mapping.
    "perl": 7,
    "ruby": 7,
    "php": 7,
    # Medium ecosystem 32 repos.
    "rpm": 5,
    "deb": 5,
    # Small ecosystem 1 repo.
    "github": 0,
}
311+
312+
313+ def get_purl_hash (purl : Union [PackageURL , str ], _bit_count : int = 0 ) -> str :
291314 """
292315 Return a short lower cased hash string from a ``purl`` string or object. The PURL is normalized
293316 and we drop its version, qualifiers and subpath.
@@ -320,30 +343,35 @@ def get_purl_hash(purl: Union[PackageURL, str], _bit_count: int = 13) -> str:
320343
321344 The hash does not change with version or qualifiers::
322345 >>> get_purl_hash("pkg:pypi/univers@30.12.0")
323- '1289 '
346+ '09 '
324347 >>> get_purl_hash("pkg:pypi/univers@10.12.0")
325- '1289 '
348+ '09 '
326349 >>> get_purl_hash("pkg:pypi/univers@30.12.0?foo=bar#sub/path")
327- '1289 '
350+ '09 '
328351
329352 The hash is left padded with zeros if it is shorter than the expected length::
330353 >>> get_purl_hash("pkg:pypi/expressionss")
331- '0057 '
354+ '57 '
332355
333356 We normalize the PURL. Here pypi normalization always uses dash for underscore ::
334357
335358 >>> get_purl_hash("pkg:pypi/license_expression")
336- '1050 '
359+ '50 '
337360 >>> get_purl_hash("pkg:pypi/license-expression")
338- '1050 '
361+ '50 '
339362
340363 Originally from:
341364 https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
342365 """
343366
344- core_purl = get_core_purl (purl ).to_string ()
367+ core_purl = get_core_purl (purl )
368+
369+ if core_purl .type in BIT_COUNT_BY_ECOSYSTEM :
370+ _bit_count = BIT_COUNT_BY_ECOSYSTEM [core_purl .type ]
371+
372+ core_purl_str = core_purl .to_string ()
345373 # compute the hash from a UTF-8 encoded string
346- purl_bytes = core_purl .encode ("utf-8" )
374+ purl_bytes = core_purl_str .encode ("utf-8" )
347375 hash_bytes = sha256 (purl_bytes ).digest ()
348376 # ... converted to integer so we can truncate with modulo. Note that we use big endian.
349377 hash_int = int .from_bytes (hash_bytes , "big" )
0 commit comments