labmlai
diff --git a/‎python_autocomplete/create_dataset.py‎
Lines changed: 167 additions & 20 deletions b/‎python_autocomplete/create_dataset.py‎
Lines changed: 167 additions & 20 deletions
diff --git a/‎python_autocomplete/download.py‎
Lines changed: 0 additions & 76 deletions b/‎python_autocomplete/download.py‎
Lines changed: 0 additions & 76 deletions
diff --git a/‎python_autocomplete/extract_downloads.py‎
Lines changed: 0 additions & 29 deletions b/‎python_autocomplete/extract_downloads.py‎
Lines changed: 0 additions & 29 deletions
diff --git a/‎python_autocomplete/remove_non_source_files.py‎
Lines changed: 0 additions & 28 deletions b/‎python_autocomplete/remove_non_source_files.py‎
Lines changed: 0 additions & 28 deletions
diff --git a/‎readme.md‎
Lines changed: 9 additions & 16 deletions b/‎readme.md‎
Lines changed: 9 additions & 16 deletions
@@ -3,12 +3,21 @@
 """
 Parse all files and write to a single file
 """
+import re
 import string
-from pathlib import Path, PurePath
-from typing import List, NamedTuple
+import urllib.error
+import urllib.request
+import zipfile
+from pathlib import Path
+from pathlib import PurePath
+from typing import List, NamedTuple, Set
+from typing import Optional
 
 import numpy as np
-from labml import logger, monit, lab
+
+from labml import lab, monit
+from labml import logger
+from labml.internal.util import rm_tree
 
 PRINTABLE = set(string.printable)
 
@@ -19,38 +28,40 @@ class PythonFile(NamedTuple):
     path: Path
 
 
-class GetPythonFiles:
+def get_python_files():
     """
     Get list of python files and their paths inside `data/source` folder
     """
 
-    def __init__(self):
-        self.source_path = Path(lab.get_data_path() / 'source')
-        self.files: List[PythonFile] = []
-        self.get_python_files(self.source_path)
-
-        logger.inspect([f.path for f in self.files])
+    source_path = Path(lab.get_data_path() / 'source')
+    files: List[PythonFile] = []
 
-    def add_file(self, path: Path):
+    def _add_file(path: Path):
         """
         Add a file to the list of tiles
         """
-        project = path.relative_to(self.source_path).parents
-        relative_path = path.relative_to(self.source_path / project[len(project) - 3])
+        project = path.relative_to(source_path).parents
+        relative_path = path.relative_to(source_path / project[len(project) - 3])
 
-        self.files.append(PythonFile(relative_path=str(relative_path),
-                                     project=str(project[len(project) - 2]),
-                                     path=path))
+        files.append(PythonFile(relative_path=str(relative_path),
+                                project=str(project[len(project) - 2]),
+                                path=path))
 
-    def get_python_files(self, path: Path):
+    def _collect_python_files(path: Path):
         """
         Recursively collect files
         """
         for p in path.iterdir():
             if p.is_dir():
-                self.get_python_files(p)
+                _collect_python_files(p)
             else:
-                self.add_file(p)
+                _add_file(p)
+
+    _collect_python_files(source_path)
+
+    logger.inspect([f.path for f in files])
+
+    return files
 
 
 def _read_file(path: Path) -> str:
@@ -72,8 +83,144 @@ def _load_code(path: PurePath, source_files: List[PythonFile]):
             f.write(_read_file(source.path) + "\n")
 
 
+def get_repos_from_readme(filename: str):
+    """
+    Extract the GitHub repositories linked from a markdown file
+
+    Reads `filename` from the lab data folder, finds every markdown link
+    of the form `[title](url)` and keeps the links whose URL ends with
+    `https://github.com/<user>/<repo>`.
+
+    Returns a list of `(user, repo)` tuples in the order the links appear.
+    """
+    with open(str(lab.get_data_path() / filename), 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    link_pattern = re.compile(r"""
+        \[(?P<title>[^\]]*)\] # title
+        \((?P<url>[^\)]*)\) # url
+    """, re.VERBOSE)
+
+    # The dots are escaped so that only the literal `github.com` host matches
+    repo_pattern = re.compile(r'https://github\.com/(?P<user>[^/]*)/(?P<repo>[^/#]*)$')
+
+    github_repos = []
+    for _title, url in link_pattern.findall(content):
+        # `$` ties the match to the end of the URL, so links that point
+        # deeper into a repo (files, issues, anchors) are skipped
+        match = repo_pattern.search(url)
+        if match is not None:
+            github_repos.append((match.group('user'), match.group('repo')))
+
+    return github_repos
+
+
+def get_awesome_pytorch_readme():
+    """
+    Download the Awesome-pytorch-list readme
+
+    The readme is saved as `pytorch_awesome.md` inside the lab data folder.
+    """
+    url = 'https://raw.githubusercontent.com/bharathgs/Awesome-pytorch-list/master/README.md'
+    # The context manager closes the HTTP response once it has been read
+    with urllib.request.urlopen(url) as md:
+        content = md.read()
+
+    # `content` is `bytes`; it is decoded instead of passed through `str()`,
+    # which would write the `b'...'` representation with escaped newlines
+    with open(str(lab.get_data_path() / 'pytorch_awesome.md'), 'w', encoding='utf-8') as f:
+        f.write(content.decode('utf-8'))
+
+
+def download_repo(org: str, repo: str, idx: Optional[int]) -> Optional[Path]:
+    """
+    Download the `master` branch of a GitHub repo as a zip file
+
+    The archive is saved to `download/{org}_{repo}.zip` inside the lab data
+    folder. If that file already exists nothing is downloaded.
+
+    * `org` is the GitHub user or organization
+    * `repo` is the name of the repository
+    * `idx` is an optional index that is shown in the progress message
+
+    Returns the path of the zip file, or `None` if the request failed
+    with an HTTP error (the error is printed).
+    """
+    zip_file = Path(lab.get_data_path() / 'download' / f'{org}_{repo}.zip')
+
+    if zip_file.exists():
+        return zip_file
+
+    idx_str = f"{idx:03}: " if idx is not None else ""
+
+    with monit.section(f"{idx_str} {org}/{repo}") as s:
+        try:
+            # The context manager closes the HTTP response once it has been read
+            with urllib.request.urlopen(f'https://github.com/{org}/{repo}/archive/master.zip') as response:
+                content = response.read()
+        except urllib.error.HTTPError as e:
+            print(e)
+            return None
+
+        size = len(content) // 1024
+        s.message = f"{size :,}KB"
+
+        with open(str(zip_file), 'wb') as f:
+            f.write(content)
+
+    return zip_file
+
+
+def create_folders():
+    """
+    Create the `download` and `source` folders inside the lab data folder
+
+    Folders that already exist are left untouched.
+    """
+    for name in ('download', 'source'):
+        # `exist_ok=True` removes the gap between checking for the folder
+        # and creating it
+        Path(lab.get_data_path() / name).mkdir(parents=True, exist_ok=True)
+
+
+def extract_zip(file_path: Path, overwrite: bool = False):
+    """
+    Extract a zip file into `source/<zip file stem>` inside the lab data folder
+
+    * `file_path` is the path of the zip file
+    * `overwrite` decides what happens when the target folder already exists:
+      if set, the folder is removed and the zip is extracted again;
+      otherwise the existing folder is kept as it is
+
+    Returns the path of the folder with the extracted files.
+    """
+    source_root = Path(lab.get_data_path() / 'source')
+
+    with monit.section(f"Extract {file_path.stem}"):
+        target = source_root / file_path.stem
+
+        if target.exists():
+            if not overwrite:
+                # Keep the result of an earlier extraction
+                return target
+            rm_tree(target)
+
+        with zipfile.ZipFile(file_path, 'r') as archive:
+            archive.extractall(target)
+
+        return target
+
+
+def remove_files(path: Path, keep: Set[str]):
+    """
+    Recursively delete the files under `path` whose suffix is not in `keep`
+
+    * `path` is the folder to clean up
+    * `keep` is the set of file suffixes (such as `.py`) that are kept
+
+    Symbolic links are always removed and never followed.
+    Folders themselves are not deleted.
+    """
+    for entry in path.iterdir():
+        if entry.is_symlink():
+            entry.unlink()
+        elif entry.is_dir():
+            remove_files(entry, keep)
+        elif entry.suffix not in keep:
+            entry.unlink()
+
+
+def batch(overwrite: bool = False):
+    """
+    Build the source folder in separate passes
+
+    Downloads the zip files of all repos first, then extracts every zip
+    in the `download` folder, and finally removes all non python files
+    from the `source` folder.
+
+    * `overwrite` is passed on to `extract_zip`
+    """
+    # `download_repo` writes into the `download` folder, so it has to exist
+    create_folders()
+
+    with monit.section('Get pytorch_awesome'):
+        get_awesome_pytorch_readme()
+        repos = get_repos_from_readme('pytorch_awesome.md')
+
+    # Download zips
+    for i, r in monit.enum(f"Download {len(repos)} repos", repos):
+        download_repo(r[0], r[1], i)
+
+    # Extract downloads
+    with monit.section('Extract zips'):
+        download = Path(lab.get_data_path() / 'download')
+
+        for repo in download.iterdir():
+            # Anything that is not a zip file (folders, stray files)
+            # cannot be opened by `zipfile` and is skipped
+            if repo.suffix != '.zip' or not repo.is_file():
+                continue
+            extract_zip(repo, overwrite)
+
+    with monit.section('Remove non python files'):
+        remove_files(lab.get_data_path() / 'source', {'.py'})
+
+
+def progressive(overwrite: bool = False):
+    """
+    Build the source folder one repo at a time
+
+    Each repo is downloaded, extracted and stripped of non python files
+    before the next repo is downloaded, so the repos processed so far
+    are usable even if the run is interrupted.
+
+    * `overwrite` is passed on to `extract_zip`
+    """
+    # `download_repo` writes into the `download` folder, so it has to exist
+    create_folders()
+
+    # Get repos
+    get_awesome_pytorch_readme()
+    repos = get_repos_from_readme('pytorch_awesome.md')
+
+    # Download zips
+    for i, r in monit.enum(f"Download {len(repos)} repos", repos):
+        zip_file = download_repo(r[0], r[1], i)
+        if zip_file is None:
+            # `download_repo` returns nothing when the request fails with
+            # an HTTP error; there is nothing to extract for this repo
+            continue
+        extracted = extract_zip(zip_file, overwrite)
+        remove_files(extracted, {'.py'})
+
+
 def main():
-    source_files = GetPythonFiles().files
+    try:
+        progressive()
+    except KeyboardInterrupt:
+        pass
+
+    source_files = get_python_files()
 
     np.random.shuffle(source_files)
 
 
@@ -6,24 +6,17 @@ This repo trains deep learning models on source code.
 
 1. Clone this repo
 2. Install requirements from `requirements.txt`
-3. Download Github repos by running `python_autocomplete/download.py`.
- It downloads all the repos mentioned in
- [PyTorch awesome list](https://github.com/bharathgs/Awesome-pytorch-list).
-4. Run `python_autocomplete/extract_downloads.py` to extract the downloaded zip files to `data/source`.
- You can directly copy any python code to `data/source` to train on them.
-5. Run `python_autocomplete/remove_non_source_files.py` to all files except `.py` files.
-6. Run `python_autocomplete/create_dataset.py` to collect all python files.
- The collected code will be written to `data/train.py` and, `data/eval.py`.
-7. Run `python_autocomplete/train.py` to train the model.
+3. Run `python_autocomplete/create_dataset.py`. It:
+   * Collects the repos mentioned in the
+ [PyTorch awesome list](https://github.com/bharathgs/Awesome-pytorch-list)
+   * Downloads the zip files of the repos
+   * Extracts the zips
+   * Removes non-Python files
+   * Collects all Python code into `data/train.py` and `data/eval.py`
+4. Run `python_autocomplete/train.py` to train the model.
  *Try changing hyper-parameters like model dimensions and number of layers*.
-8. Run `evaluate.py` to evaluate the model.
-9. Enjoy!
+5. Run `evaluate.py` to evaluate the model.
 
-If you have any questions please open an issue on Github.
-
-Feel free to add interesting repos with lots of Python code to `download.py`.
- Thank you.
- 
 <p align="center">
   <img src="/python-autocomplete.png?raw=true" width="100%" title="Screenshot">
 </p>