Commit 876fde7: bpe evaluate
Parent: 014c50c

3 files changed: 53 additions, 54 deletions

python_autocomplete/bpe.py (8 additions, 8 deletions)
@@ -26,11 +26,11 @@ def itos(self):
     def stoi(self):
         return self.bpe.bpe_stoi

-    def encode(self, data: str):
-        words = self.tokenizer.tokenize(data)
+    def encode(self, data: str, *, is_silent: bool = False):
+        words = self.tokenizer.tokenize(data, is_silent=is_silent)

         res = []
-        for w in monit.iterate('Encode words', words):
+        for w in monit.iterate('Encode words', words, is_silent=is_silent):
             res += self.bpe.encode(w)

         return res
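The new keyword-only is_silent flag lets callers that encode many short strings, such as the evaluation path added in this commit, suppress labml's progress output; the diff relies on monit.iterate and monit.enum accepting is_silent in the labml version targeted here. A minimal sketch of the pattern, where encode_word is a hypothetical stand-in for the repo's self.bpe.encode(w):

from labml import monit

def encode_word(word: str) -> list:
    # Hypothetical stand-in for the BPE merge step; returns character codes.
    return [ord(c) for c in word]

def encode(words, *, is_silent: bool = False) -> list:
    res = []
    # Forwarding is_silent keeps the progress output quiet for tiny prompts.
    for w in monit.iterate('Encode words', words, is_silent=is_silent):
        res += encode_word(w)
    return res

print(encode(['def', 'foo'], is_silent=True))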
@@ -131,7 +131,7 @@ def encode(self, word: str):
         if word in self.popular_words:
             return self.popular_words[word]

-        return self.encoder.encode([self.char_stoi[c] for c in word])
+        return self.encoder.encode([self.char_stoi[c] for c in word if c in self.char_stoi])


 class Tokenizer:
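The added "if c in self.char_stoi" guard makes BPE.encode drop characters that never appeared in the training data instead of raising a KeyError, which matters once arbitrary user prompts are encoded at evaluation time. A self-contained illustration of the difference (the char_stoi table here is made up):

char_stoi = {'a': 0, 'b': 1, 'c': 2}   # hypothetical character vocabulary
word = 'ab€c'                          # '€' was never seen during training

# Old behaviour: KeyError on the unknown character.
# ids = [char_stoi[ch] for ch in word]

# New behaviour: unknown characters are silently skipped.
ids = [char_stoi[ch] for ch in word if ch in char_stoi]
print(ids)   # [0, 1, 2]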
@@ -141,7 +141,7 @@ def collect_words(self, data: str):
     def get_words(self) -> Tuple[List[str], List[int]]:
         raise NotImplementedError

-    def tokenize(self, data: str) -> List[str]:
+    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
         raise NotImplementedError


@@ -158,12 +158,12 @@ def add_word(self, word):
         else:
             self.words[word] += 1

-    def tokenize(self, data: str) -> List[str]:
+    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
         last_idx = 0
         is_id = False
         res = []

-        for i, c in monit.enum('Collect words', data):
+        for i, c in monit.enum('Collect words', data, is_silent=is_silent):
             if c in ID_CHARS:
                 if not is_id:
                     if last_idx < i:
@@ -217,7 +217,7 @@ def collect_words(self, data):
     def get_words(self):
         return [self.data], [1]

-    def tokenize(self, data: str) -> List[str]:
+    def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
         return [data]

python_autocomplete/evaluate.py (15 additions, 15 deletions)
@@ -9,18 +9,19 @@
 from labml import experiment, logger, lab, monit
 from labml.logger import Text, Style
 from labml.utils.pytorch import get_modules
+from labml_helpers.datasets.text import TextDataset
 from labml_helpers.module import Module
 from python_autocomplete.train import Configs, StateUpdater


 class Predictor:
-    def __init__(self, model: Module, stoi: Dict[str, int], itos: List[str], *,
+    def __init__(self, model: Module, text: TextDataset, *,
                  state_updater: StateUpdater,
                  is_token_by_token: bool):
+        text.is_silent = True
+        self.text = text
         self.is_token_by_token = is_token_by_token
         self.state_updater = state_updater
-        self.stoi = stoi
-        self.itos = itos
         self.model = model

         # For timing
@@ -29,10 +30,8 @@ def __init__(self, model: Module, stoi: Dict[str, int], itos: List[str], *,
         self.time_check = 0

     def _get_predictions(self, prompt: str, state: Any) -> Tuple[torch.Tensor, Any]:
-        prompt = prompt[-512:]
-        data = torch.tensor([[self.stoi[c]] for c in prompt if c in self.stoi],
-                            dtype=torch.long,
-                            device=self.model.device)
+        data = self.text.text_to_i(prompt)[-512:]
+        data = data.to(self.model.device).unsqueeze(-1)

         # Get predictions
         with torch.no_grad():
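_get_predictions now defers tokenization to the dataset's text_to_i, so the same predictor works for the character and the BPE vocabularies; the 512-element cut is applied to tokens after encoding rather than to raw characters before it. A small sketch of the new preprocessing in isolation (the to_i function below is a toy character-level stand-in for TextDataset.text_to_i, not the repo's implementation):

import torch

def prepare_prompt(to_i, prompt: str, device: torch.device) -> torch.Tensor:
    # Mirrors the new preprocessing: tokenize, keep the last 512 tokens,
    # move to the model's device and add a trailing batch dimension.
    data = to_i(prompt)[-512:]
    return data.to(device).unsqueeze(-1)

stoi = {c: i for i, c in enumerate('abcdef ')}
to_i = lambda s: torch.tensor([stoi[c] for c in s if c in stoi], dtype=torch.long)

x = prepare_prompt(to_i, 'add cab', torch.device('cpu'))
print(x.shape)   # torch.Size([7, 1])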
@@ -58,7 +57,7 @@ def get_probabilities(self, prompt: str, state: Any) -> Tuple[np.ndarray, Any]:
     def get_next_token(self, prompt: str, state: Any) -> Tuple[str, Any]:
         prediction, state = self.get_predictions(prompt, state)
         best = prediction.argmax(-1).squeeze().item()
-        return self.itos[best], state
+        return self.text.itos[best], state

     def get_start_state(self, prompt: str):
         assert prompt
@@ -152,10 +151,10 @@ def anomalies(predictor: Predictor, text: str):
             logs = [(f"{line_no: 4d}: ", Text.meta)]
         elif c == '\r':
             continue
-        elif c not in predictor.stoi:
+        elif c not in predictor.text.stoi:
             logs.append(c)
         else:
-            next_id = predictor.stoi[c]
+            next_id = predictor.text.stoi[c]
             prob = preds[next_id]
             if prob > 0.9:
                 logs.append((c, [Style.bold, Text.success, Style.underline]))
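With the dataset attached to the predictor, both directions of the vocabulary mapping come from one object: stoi for scoring characters in anomalies, itos for decoding the argmax in get_next_token. A toy version of that decode step, with an illustrative vocabulary rather than the repo's:

import torch

itos = ['a', 'b', 'c', '\n']                      # toy vocabulary
logits = torch.tensor([0.1, 2.5, 0.3, -1.0])      # model output for one position

best = logits.argmax(-1).item()
print(itos[best])   # 'b'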
@@ -220,21 +219,22 @@ def get_predictor():
     # And for latest checkpoint
     # checkpoint = None

-    run_uuid = '41dc02106d1611eb9ab213fdf628e807'  # bpe
+    run_uuid = '275e62e66dc711eb9d162f2ddfc33452'  # bpe
     # run_uuid = 'c45857026a2811eba16c27c69839e51f'  # xl
     checkpoint = None
-    # run_uuid, checkpoint = experiment.load_bundle(
-    #     lab.get_path() / 'saved_checkpoint.tar.gz',
-    #     url='https://github.com/lab-ml/python_autocomplete/releases/download/0.0.4/transformer_checkpoint.tar.gz')
+    run_uuid, checkpoint = experiment.load_bundle(
+        lab.get_path() / 'saved_checkpoint.tar.gz',
+        url='https://github.com/lab-ml/python_autocomplete/releases/download/0.0.4/transformer_checkpoint.tar.gz')

     conf_dict = experiment.load_configs(run_uuid)
+    conf_dict['is_load_data'] = False
     experiment.configs(conf, conf_dict)
     experiment.add_pytorch_models(get_modules(conf))
     experiment.load(run_uuid, checkpoint)

     experiment.start()
     conf.model.eval()
-    return Predictor(conf.model, conf.stoi, conf.itos,
+    return Predictor(conf.model, conf.text,
                      state_updater=conf.state_updater,
                      is_token_by_token=conf.is_token_by_token)

python_autocomplete/train.py (30 additions, 31 deletions)
@@ -20,34 +20,48 @@


 class SourceCodeDataset(TextDataset):
-    def __init__(self, path: PurePath, tokenizer: Callable):
-        with monit.section("Load data"):
-            train = self.load(path / 'train.py')  # [:100000]
-            valid = self.load(path / 'valid.py')  # [:100000]
+    def __init__(self, path: PurePath, tokenizer: Callable, dont_load: bool):
+        if not dont_load:
+            with monit.section("Load data"):
+                train = self.load(path / 'train.py')  # [:100000]
+                valid = self.load(path / 'valid.py')  # [:100000]
+        else:
+            train = ''
+            valid = ''

-        from labml.utils.cache import cache_get
+        from labml.utils.cache import cache_get, cache_set

         super().__init__(path, tokenizer, train, valid, '',
                          n_tokens=cache_get('n_tokens'),
                          itos=cache_get('itos'),
                          stoi=cache_get('stoi'))

+        cache_set(f'n_tokens', self.n_tokens)
+        cache_set(f'itos', self.itos)
+        cache_set(f'stoi', self.stoi)
+

 class BPESourceCodeDataset(TextDataset):
     tokenizer: BPE

-    def __init__(self, path: PurePath, bpe: BPE):
-        with monit.section("Load data"):
-            train = self.load(path / 'train.py')  # [:100_000]
-            valid = self.load(path / 'valid.py')  # [:100_000]
+    def __init__(self, path: PurePath, bpe: BPE, dont_load: bool):
+        if not dont_load:
+            with monit.section("Load data"):
+                train = self.load(path / 'train.py')  # [:100_000]
+                valid = self.load(path / 'valid.py')  # [:100_000]
+        else:
+            train = ''
+            valid = ''
+
+        self.is_silent = False

         super().__init__(path, bpe, train, valid, '',
                          n_tokens=bpe.n_tokens,
                          itos=bpe.itos,
                          stoi=bpe.stoi)

     def text_to_i(self, text: str) -> torch.Tensor:
-        return torch.tensor(self.tokenizer.encode(text))
+        return torch.tensor(self.tokenizer.encode(text, is_silent=self.is_silent))


 class Configs(TrainValidConfigs):
@@ -80,10 +94,8 @@ class Configs(TrainValidConfigs):
     grad_norm_clip: float = 1.0
     is_token_by_token: bool = False

-    itos: List[str]
-    stoi: Dict[str, int]
-
     cache_name: str = ''
+    is_load_data: bool = True

     def init(self):
         tracker.set_queue("loss.*", 20, True)
@@ -129,10 +141,10 @@ def sample(self):
             data = data.to(self.device)
             output, new_state = self.model(data, state)
             output = output.argmax(dim=-1).squeeze(1)
-            prompt += '' + self.itos[output[-1]]
+            prompt += '' + self.text.itos[output[-1]]
             if self.is_token_by_token:
                 prompt = prompt[-1:]
-            log += [('' + self.itos[output[-1]], Text.value)]
+            log += [('' + self.text.itos[output[-1]], Text.value)]
             state = self.state_updater(state, new_state)

         logger.log(log)
@@ -177,20 +189,7 @@ def _loss_func(c: Configs):

 @option(Configs.n_tokens)
 def _n_tokens(c: Configs):
-    from labml.utils.cache import cache
-    return cache(f'n_tokens{c.cache_name}', lambda: c.text.n_tokens)
-
-
-@option(Configs.itos)
-def _itos(c: Configs):
-    from labml.utils.cache import cache
-    return cache(f'itos{c.cache_name}', lambda: c.text.itos)
-
-
-@option(Configs.stoi)
-def _stoi(c: Configs):
-    from labml.utils.cache import cache
-    return cache(f'stoi{c.cache_name}', lambda: c.text.stoi)
+    return c.text.n_tokens


 @option(Configs.model)
@@ -285,7 +284,7 @@ def character():

 @option(Configs.text)
 def source_code(c: Configs):
-    return SourceCodeDataset(lab.get_data_path(), c.tokenizer)
+    return SourceCodeDataset(lab.get_data_path(), c.tokenizer, c.is_load_data)


 @option(Configs.text)
@@ -301,7 +300,7 @@ def source_code_bpe(c: Configs):
         raise RuntimeError('BPE not cached')

     tokenizer = BPE(bpe_en_de, SourceCodeTokenizer())
-    return BPESourceCodeDataset(lab.get_data_path(), tokenizer)
+    return BPESourceCodeDataset(lab.get_data_path(), tokenizer, c.is_load_data)


 @option(Configs.train_loader)
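The dataset constructors can now be built without reading train.py and valid.py from disk, which is what the evaluator relies on when it sets is_load_data to False, and SourceCodeDataset now writes n_tokens, itos and stoi back to labml's cache after construction. A rough sketch of the load-or-skip pattern, independent of the repo's classes (the load_file helper is hypothetical):

from pathlib import Path

def load_file(path: Path) -> str:
    with open(str(path)) as f:
        return f.read()

def make_splits(path: Path, dont_load: bool):
    # When evaluation only needs the cached vocabulary, skip reading the
    # training files and fall back to empty strings.
    if not dont_load:
        train = load_file(path / 'train.py')
        valid = load_file(path / 'valid.py')
    else:
        train = ''
        valid = ''
    return train, valid

Note that the constructors take the flag as dont_load while the config option is is_load_data, so the two names read with opposite senses where they meet in source_code and source_code_bpe.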