graykode

(refactor) black style
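
For context, the changes below are mechanical: black normalizes quoting, spacing, blank lines, trailing commas, and line wrapping without changing runtime behavior. A minimal sketch of reproducing this kind of reformat through black's Python API (the only assumption is that black is installed; the exact black version and configuration used for this commit are not shown):

# Minimal sketch, assuming `black` is installed (pip install black).
# The black version/config used for this commit is not shown, so defaults apply.
import black

src = "class PATCH(enum.Enum):\n    PLUS=1\n    MINUS=2\n"

# format_str applies the same rules the `black` CLI would apply to a file.
formatted = black.format_str(src, mode=black.Mode())
print(formatted)
# class PATCH(enum.Enum):
#     PLUS = 1
#     MINUS = 2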

@@ -68,6 +68,7 @@ def main(args):
)
print(commit_message)

+
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Code to collect commits on github")
parser.add_argument(
...
# Copyright 2020-present Tae Hwan Jung
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,6 +15,6 @@
from .gitcommit import diff_parse, truncate

__all__ = [
- 'diff_parse',
- 'truncate',
-]
\ No newline at end of file
+ "diff_parse",
+ "truncate",
+]
...
@@ -36,9 +36,11 @@ logging.basicConfig(
level=logging.INFO,
)

+
class PATCH(enum.Enum):
- PLUS=1
+ PLUS = 1
- MINUS=2
+ MINUS = 2
+

def truncate(tuple, max_length, value=0):
ls = []
@@ -46,22 +48,20 @@ def truncate(tuple, max_length, value=0):
if isinstance(t, int):
t = [t]
ls.extend(t)
- ls = ls[:max_length - 1]
+ ls = ls[: max_length - 1]
ls.insert(0, value)
if len(ls) < max_length:
ls.extend([0] * (max_length - len(ls)))
assert len(ls) == max_length
return ls

+
def encode_line(tokenizer, line, patch):
- line = re.sub(r'[\u0100-\uFFFF\U00010000-\U0010FFFF]+', '', line).strip()
+ line = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", line).strip()
tokens = tokenizer.tokenize(line)
tokens = tokenizer.convert_tokens_to_ids(tokens)
- return (
+ return (tokens, [1] * len(tokens), len(tokens) * [patch.value])
- tokens,
+
- [1] * len(tokens),
- len(tokens) * [patch.value]
- )

def diff_parse(diff, tokenizer):
chunks = []
@@ -78,6 +78,7 @@ def diff_parse(diff, tokenizer):
chunks.append(encode_line(tokenizer, change.line, PATCH.MINUS))
return chunks

+
def sha_parse(sha, tokenizer, max_length=1024):

chunks = diff_parse(diff=repo.git.show(sha), tokenizer=tokenizer)
@@ -91,16 +92,18 @@ def sha_parse(sha, tokenizer, max_length=1024):

return (input_ids, attention_masks, patch_ids)

+
def message_parse(msg, tokenizer, max_length=56):
- msg = re.sub(r'(\(|)#([0-9])+(\)|)', '', msg)
+ msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)

- msg = re.sub(r'[\u0100-\uFFFF\U00010000-\U0010FFFF]+', '', msg).strip()
+ msg = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", msg).strip()
msg = tokenizer.tokenize(msg)
msg = tokenizer.convert_tokens_to_ids(msg)
msg = truncate(msg, max_length, value=0)

return msg

+
def jobs(sha_msgs, args, data_config, train=True):

input_ids, attention_masks, patch_ids, targets = [], [], [], []
@@ -110,9 +113,7 @@ def jobs(sha_msgs, args, data_config, train=True):
sha, msg = sha_msg

source = sha_parse(
- sha,
+ sha, tokenizer=args.tokenizer, max_length=args.max_source_length
- tokenizer=args.tokenizer,
- max_length=args.max_source_length
)
if not source:
continue
@@ -120,7 +121,9 @@ def jobs(sha_msgs, args, data_config, train=True):
target = message_parse(
msg,
tokenizer=args.tokenizer,
- max_length=(args.max_target_length if train else args.val_max_target_length),
+ max_length=(
+ args.max_target_length if train else args.val_max_target_length
+ ),
)

input_ids.append(input_id)
@@ -128,14 +131,17 @@ def jobs(sha_msgs, args, data_config, train=True):
patch_ids.append(patch_id)
targets.append(target)

- data_saver({
+ data_saver(
- "input_ids": np.asarray(input_ids),
+ {
- "attention_masks": np.asarray(attention_masks),
+ "input_ids": np.asarray(input_ids),
- "patch_ids": np.asarray(patch_ids),
+ "attention_masks": np.asarray(attention_masks),
- "targets": np.asarray(targets),
+ "patch_ids": np.asarray(patch_ids),
- })
+ "targets": np.asarray(targets),
+ }
+ )
data_saver.disconnect()

+
def start(chunked_sha_msgs, train=True):

logger.info(f"Start %s pre-processing" % ("training" if train else "evaluation"))
@@ -144,22 +150,22 @@ def start(chunked_sha_msgs, train=True):

data_config = DataConfig(
endpoint=args.endpoint,
- access_key=os.environ['access_key'],
+ access_key=os.environ["access_key"],
- secret_key=os.environ['secret_key'],
+ secret_key=os.environ["secret_key"],
region=args.region,
- dataset_name='commit-autosuggestions',
+ dataset_name="commit-autosuggestions",
additional={
- "mode" : ("training" if train else "evaluation"),
+ "mode": ("training" if train else "evaluation"),
"max_source_length": args.max_source_length,
"max_target_length": max_target_length,
- "url" : args.url,
+ "url": args.url,
},
attributes=[
- ('input_ids', 'int32', (args.max_source_length,)),
+ ("input_ids", "int32", (args.max_source_length,)),
- ('attention_masks', 'int32', (args.max_source_length,)),
+ ("attention_masks", "int32", (args.max_source_length,)),
- ('patch_ids', 'int32', (args.max_source_length,)),
+ ("patch_ids", "int32", (args.max_source_length,)),
- ('targets', 'int32', (max_target_length,))
+ ("targets", "int32", (max_target_length,)),
- ]
+ ],
)

func = partial(jobs, args=args, data_config=data_config, train=train)
@@ -168,14 +174,15 @@ def start(chunked_sha_msgs, train=True):
for i, _ in tqdm(enumerate(pool.imap_unordered(func, chunked_sha_msgs))):
pbar.update()

+
def main(args):
- if 'access_key' not in os.environ or 'secret_key' not in os.environ:
+ if "access_key" not in os.environ or "secret_key" not in os.environ:
raise OSError("access_key or secret_key are not found.")

sha_msgs = [(c.hexsha, c.summary) for c in repo.iter_commits()]
random.shuffle(sha_msgs)
chunked_sha_msgs = [
- sha_msgs[x:x + args.matorage_batch]
+ sha_msgs[x : x + args.matorage_batch]
for x in range(0, len(sha_msgs), args.matorage_batch)
]

@@ -185,29 +192,25 @@ def main(args):
if args.do_predict:
start(chunked_sha_msgs[barrier:], train=False)

+
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Code to collect commits on github")
- parser.add_argument(
+ parser.add_argument("--url", type=str, required=True, help="github url")
- "--url",
- type=str,
- required=True,
- help="github url"
- )
parser.add_argument(
"--endpoint",
type=str,
required=True,
- help='matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html'
+ help="matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html",
)
parser.add_argument(
"--region",
type=str,
default=None,
- help='matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html'
+ help="matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html",
)
parser.add_argument(
"--tokenizer_name",
- default='sshleifer/distilbart-xsum-6-6',
+ default="sshleifer/distilbart-xsum-6-6",
type=str,
help="Pretrained tokenizer name or path if not the same as model_name",
)
@@ -215,41 +218,40 @@ if __name__ == "__main__":
"--matorage_batch",
default=1024,
type=int,
- help='The smallest batch size stored atomically in matorage.'
+ help="The smallest batch size stored atomically in matorage.",
)
parser.add_argument(
- "--num_workers",
+ "--num_workers", default=4, type=int, help="number of process",
- default=4,
- type=int,
- help="number of process",
)
parser.add_argument(
"--max_source_length",
default=1024,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
- "than this will be truncated, sequences shorter will be padded.",
+ "than this will be truncated, sequences shorter will be padded.",
)
parser.add_argument(
"--max_target_length",
default=56,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
- "than this will be truncated, sequences shorter will be padded.",
+ "than this will be truncated, sequences shorter will be padded.",
)
parser.add_argument(
"--val_max_target_length",
default=142, # these defaults are optimized for CNNDM. For xsum, see README.md.
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
- "than this will be truncated, sequences shorter will be padded.",
+ "than this will be truncated, sequences shorter will be padded.",
+ )
+ parser.add_argument(
+ "--p_val", type=float, default=0.25, help="percent of validation dataset"
)
- parser.add_argument("--p_val", type=float, default=0.25, help="percent of validation dataset")
parser.add_argument("--do_train", action="store_true", default=False)
parser.add_argument("--do_predict", action="store_true", default=False)
args = parser.parse_args()

- args.local_path = args.url.split('/')[-1]
+ args.local_path = args.url.split("/")[-1]
logger.info(f"master branch of {args.url} will be downloaded to {args.local_path}")
repo = (
Repo(args.local_path)
...
# Copyright 2020-present Tae Hwan Jung
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,6 +14,4 @@

from .modeling_bart import BartForConditionalGeneration

-__all__ = [
- 'BartForConditionalGeneration'
-]
\ No newline at end of file
+__all__ = ["BartForConditionalGeneration"]
...
@@ -20,16 +20,31 @@ logger = logging.getLogger(__name__)

class Seq2SeqLoggingCallback(pl.Callback):
def on_batch_end(self, trainer, pl_module):
- lrs = {f"lr_group_{i}": param["lr"] for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups)}
+ lrs = {
+ f"lr_group_{i}": param["lr"]
+ for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups)
+ }
pl_module.logger.log_metrics(lrs)

@rank_zero_only
def _write_logs(
- self, trainer: pl.Trainer, pl_module: pl.LightningModule, type_path: str, save_generations=True
+ self,
+ trainer: pl.Trainer,
+ pl_module: pl.LightningModule,
+ type_path: str,
+ save_generations=True,
) -> None:
- logger.info(f"***** {type_path} results at step {trainer.global_step:05d} *****")
+ logger.info(
+ f"***** {type_path} results at step {trainer.global_step:05d} *****"
+ )
metrics = trainer.callback_metrics
- trainer.logger.log_metrics({k: v for k, v in metrics.items() if k not in ["log", "progress_bar", "preds"]})
+ trainer.logger.log_metrics(
+ {
+ k: v
+ for k, v in metrics.items()
+ if k not in ["log", "progress_bar", "preds"]
+ }
+ )
# Log results
od = Path(pl_module.hparams.output_dir)
if type_path == "test":
@@ -39,7 +54,9 @@ class Seq2SeqLoggingCallback(pl.Callback):
# this never gets hit. I prefer not to save intermediate generations, and results are in metrics.json
# If people want this it will be easy enough to add back.
results_file = od / f"{type_path}_results/{trainer.global_step:05d}.txt"
- generations_file = od / f"{type_path}_generations/{trainer.global_step:05d}.txt"
+ generations_file = (
+ od / f"{type_path}_generations/{trainer.global_step:05d}.txt"
+ )
results_file.parent.mkdir(exist_ok=True)
generations_file.parent.mkdir(exist_ok=True)
with open(results_file, "a+") as writer:
@@ -68,7 +85,9 @@ class Seq2SeqLoggingCallback(pl.Callback):

n_trainable_pars = count_trainable_parameters(pl_module)
# mp stands for million parameters
- trainer.logger.log_metrics({"n_params": npars, "mp": npars / 1e6, "grad_mp": n_trainable_pars / 1e6})
+ trainer.logger.log_metrics(
+ {"n_params": npars, "mp": npars / 1e6, "grad_mp": n_trainable_pars / 1e6}
+ )

@rank_zero_only
def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
@@ -98,8 +117,5 @@ def get_checkpoint_callback(output_dir, metric):

def get_early_stopping_callback(metric, patience):
return EarlyStopping(
- monitor=f"val_{metric}",
+ monitor=f"val_{metric}", mode="max", patience=patience, verbose=True,
- mode="max",
- patience=patience,
- verbose=True,
)
...
@@ -69,7 +69,7 @@ class BaseTransformer(pl.LightningModule):
config=None,
tokenizer=None,
model=None,
- **config_kwargs
+ **config_kwargs,
):
"""Initialize a model, tokenizer and config."""
super().__init__()
@@ -83,7 +83,9 @@ class BaseTransformer(pl.LightningModule):
cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
if config is None:
self.config = AutoConfig.from_pretrained(
- self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
+ self.hparams.config_name
+ if self.hparams.config_name
+ else self.hparams.model_name_or_path,
**({"num_labels": num_labels} if num_labels is not None else {}),
cache_dir=cache_dir,
**config_kwargs,
@@ -91,15 +93,24 @@ class BaseTransformer(pl.LightningModule):
else:
self.config: PretrainedConfig = config

- extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
+ extra_model_params = (
+ "encoder_layerdrop",
+ "decoder_layerdrop",
+ "dropout",
+ "attention_dropout",
+ )
for p in extra_model_params:
if getattr(self.hparams, p, None):
- assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
+ assert hasattr(
+ self.config, p
+ ), f"model config doesn't have a `{p}` attribute"
setattr(self.config, p, getattr(self.hparams, p))

if tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(
- self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
+ self.hparams.tokenizer_name
+ if self.hparams.tokenizer_name
+ else self.hparams.model_name_or_path,
cache_dir=cache_dir,
)
else:
@@ -121,7 +132,9 @@ class BaseTransformer(pl.LightningModule):
def get_lr_scheduler(self):
get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
scheduler = get_schedule_func(
- self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps
+ self.opt,
+ num_warmup_steps=self.hparams.warmup_steps,
+ num_training_steps=self.total_steps,
)
scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
return scheduler
@@ -132,22 +145,35 @@ class BaseTransformer(pl.LightningModule):
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
- "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+ "params": [
+ p
+ for n, p in model.named_parameters()
+ if not any(nd in n for nd in no_decay)
+ ],
"weight_decay": self.hparams.weight_decay,
},
{
- "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+ "params": [
+ p
+ for n, p in model.named_parameters()
+ if any(nd in n for nd in no_decay)
+ ],
"weight_decay": 0.0,
},
]
if self.hparams.adafactor:
optimizer = Adafactor(
- optimizer_grouped_parameters, lr=self.hparams.learning_rate, scale_parameter=False, relative_step=False
+ optimizer_grouped_parameters,
+ lr=self.hparams.learning_rate,
+ scale_parameter=False,
+ relative_step=False,
)

else:
optimizer = AdamW(
- optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
+ optimizer_grouped_parameters,
+ lr=self.hparams.learning_rate,
+ eps=self.hparams.adam_epsilon,
)
self.opt = optimizer

@@ -165,13 +191,19 @@ class BaseTransformer(pl.LightningModule):
def total_steps(self) -> int:
"""The number of total training steps that will be run. Used for lr scheduler purposes."""
num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores
- effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
+ effective_batch_size = (
+ self.hparams.train_batch_size
+ * self.hparams.accumulate_grad_batches
+ * num_devices
+ )
dataset_size = len(self.train_loader.dataset)
return (dataset_size / effective_batch_size) * self.hparams.max_epochs

def setup(self, mode):
if mode == "fit":
- self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)
+ self.train_loader = self.get_dataloader(
+ "train", self.hparams.train_batch_size, shuffle=True
+ )

def get_dataloader(self, type_path, batch_size, shuffle=False):
raise NotImplementedError("You must implement this for your task")
@@ -212,7 +244,10 @@ class BaseTransformer(pl.LightningModule):
help="Path to pretrained model or model identifier from huggingface.co/models",
parser.add_argument(
- "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+ "--config_name",
+ default="",
+ type=str,
+ help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
@@ -246,7 +281,12 @@ class BaseTransformer(pl.LightningModule):
type=float,
help="Attention dropout probability (Optional). Goes into model.config",
)
- parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+ parser.add_argument(
+ "--learning_rate",
+ default=5e-5,
+ type=float,
+ help="The initial learning rate for Adam.",
+ )
parser.add_argument(
"--lr_scheduler",
default="linear",
@@ -255,11 +295,30 @@ class BaseTransformer(pl.LightningModule):
type=str,
help="Learning rate scheduler",
)
- parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+ parser.add_argument(
- parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+ "--weight_decay",
- parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+ default=0.0,
- parser.add_argument("--num_workers", default=4, type=int, help="kwarg passed to DataLoader")
+ type=float,
- parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
+ help="Weight decay if we apply some.",
+ )
+ parser.add_argument(
+ "--adam_epsilon",
+ default=1e-8,
+ type=float,
+ help="Epsilon for Adam optimizer.",
+ )
+ parser.add_argument(
+ "--warmup_steps",
+ default=0,
+ type=int,
+ help="Linear warmup over warmup_steps.",
+ )
+ parser.add_argument(
+ "--num_workers", default=4, type=int, help="kwarg passed to DataLoader"
+ )
+ parser.add_argument(
+ "--num_train_epochs", dest="max_epochs", default=3, type=int
+ )
parser.add_argument("--train_batch_size", default=32, type=int)
parser.add_argument("--eval_batch_size", default=32, type=int)
parser.add_argument("--adafactor", action="store_true")
@@ -283,7 +342,9 @@ class LoggingCallback(pl.Callback):
rank_zero_info("***** Test results *****")
metrics = trainer.callback_metrics
# Log and save results to file
- output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
+ output_test_results_file = os.path.join(
+ pl_module.hparams.output_dir, "test_results.txt"
+ )
with open(output_test_results_file, "w") as writer:
for key in sorted(metrics):
if key not in ["log", "progress_bar"]:
@@ -314,9 +375,21 @@ def add_generic_args(parser, root_dir) -> None:
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int)
- parser.add_argument("--max_grad_norm", dest="gradient_clip_val", default=1.0, type=float, help="Max gradient norm")
+ parser.add_argument(
- parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+ "--max_grad_norm",
- parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
+ dest="gradient_clip_val",
+ default=1.0,
+ type=float,
+ help="Max gradient norm",
+ )
+ parser.add_argument(
+ "--do_train", action="store_true", help="Whether to run training."
+ )
+ parser.add_argument(
+ "--do_predict",
+ action="store_true",
+ help="Whether to run predictions on the test set.",
+ )
parser.add_argument(
"--gradient_accumulation_steps",
dest="accumulate_grad_batches",
@@ -324,7 +397,9 @@ def add_generic_args(parser, root_dir) -> None:
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
- parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+ parser.add_argument(
+ "--seed", type=int, default=42, help="random seed for initialization"
+ )


def generic_train(
@@ -335,7 +410,7 @@ def generic_train(
extra_callbacks=[],
checkpoint_callback=None,
logging_callback=None,
- **extra_train_kwargs
+ **extra_train_kwargs,
):
pl.seed_everything(args.seed)

@@ -346,7 +421,11 @@ def generic_train(
# add custom checkpoints
if checkpoint_callback is None:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
- filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
+ filepath=args.output_dir,
+ prefix="checkpoint",
+ monitor="val_loss",
+ mode="min",
+ save_top_k=1,
)
if logging_callback is None:
logging_callback = LoggingCallback()
...
@@ -39,9 +39,13 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100):
return loss, nll_loss


-def encode_line(tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"):
+def encode_line(
+ tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"
+):
"""Only used by LegacyDataset"""
- extra_kw = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
+ extra_kw = (
+ {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
+ )
return tokenizer(
[line],
max_length=max_length,
@@ -63,9 +67,7 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:


def trim_batch(
- input_ids,
+ input_ids, pad_token_id, attention_mask=None,
- pad_token_id,
- attention_mask=None,
):
"""Remove columns that are populated exclusively by pad_token_id"""
keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
@@ -125,7 +127,9 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
def __getitem__(self, index) -> Dict[str, torch.Tensor]:
"""Call tokenizer on src and tgt_lines"""
index = index + 1 # linecache starts at 1
- source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
+ source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip(
+ "\n"
+ )
tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
assert source_line, f"empty source line for index {index}"
assert tgt_line, f"empty tgt line for index {index}"
@@ -147,7 +151,9 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
target_ids = torch.stack([x["labels"] for x in batch])
pad_token_id = self.pad_token_id
y = trim_batch(target_ids, pad_token_id)
- source_ids, source_mask = trim_batch(input_ids, pad_token_id, attention_mask=masks)
+ source_ids, source_mask = trim_batch(
+ input_ids, pad_token_id, attention_mask=masks
+ )
batch = {
"input_ids": source_ids,
"attention_mask": source_mask,
@@ -161,7 +167,9 @@ class Seq2SeqDataset(AbstractSeq2SeqDataset):

def __getitem__(self, index) -> Dict[str, str]:
index = index + 1 # linecache starts at 1
- source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
+ source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip(
+ "\n"
+ )
tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
assert source_line, f"empty source line for index {index}"
assert tgt_line, f"empty tgt line for index {index}"
@@ -201,12 +209,23 @@ class SortishSampler(Sampler):
idxs = np.random.permutation(len(self.data))
sz = self.bs * 50
ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)]
- sort_idx = np.concatenate([sorted(s, key=self.key, reverse=True) for s in ck_idx])
+ sort_idx = np.concatenate(
+ [sorted(s, key=self.key, reverse=True) for s in ck_idx]
+ )
sz = self.bs
ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)]
- max_ck = np.argmax([self.key(ck[0]) for ck in ck_idx]) # find the chunk with the largest key,
+ max_ck = np.argmax(
- ck_idx[0], ck_idx[max_ck] = ck_idx[max_ck], ck_idx[0] # then make sure it goes first.
+ [self.key(ck[0]) for ck in ck_idx]
- sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([], dtype=np.int)
+ ) # find the chunk with the largest key,
+ ck_idx[0], ck_idx[max_ck] = (
+ ck_idx[max_ck],
+ ck_idx[0],
+ ) # then make sure it goes first.
+ sort_idx = (
+ np.concatenate(np.random.permutation(ck_idx[1:]))
+ if len(ck_idx) > 1
+ else np.array([], dtype=np.int)
+ )
sort_idx = np.concatenate((ck_idx[0], sort_idx))
return iter(sort_idx)

@@ -269,7 +288,9 @@ def get_git_info():
ROUGE_KEYS = ["rouge1", "rouge2", "rougeL"]


-def calculate_rouge(output_lns: List[str], reference_lns: List[str], use_stemmer=True) -> Dict:
+def calculate_rouge(
+ output_lns: List[str], reference_lns: List[str], use_stemmer=True
+) -> Dict:
scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
aggregator = scoring.BootstrapAggregator()

@@ -302,7 +323,9 @@ def assert_all_frozen(model):
model_grads: List[bool] = list(grad_status(model))
n_require_grad = sum(lmap(int, model_grads))
npars = len(model_grads)
- assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad"
+ assert not any(
+ model_grads
+ ), f"{n_require_grad/npars:.1%} of {npars} weights require grad"


def assert_not_all_frozen(model):
...