graykode

(add) support other language in dockerfile and gitparser

...@@ -46,17 +46,15 @@ Recommended Commit Message : Remove unused imports ...@@ -46,17 +46,15 @@ Recommended Commit Message : Remove unused imports
46 To solve this problem, we use a new embedding called [`patch_type_embeddings`](https://github.com/graykode/commit-autosuggestions/blob/master/commit/model/diff_roberta.py#L40) that distinguishes added code from deleted code, just as Lample et al., 2019 (XLM) used language embeddings (1 for added, 2 for deleted). 46 To solve this problem, we use a new embedding called [`patch_type_embeddings`](https://github.com/graykode/commit-autosuggestions/blob/master/commit/model/diff_roberta.py#L40) that distinguishes added code from deleted code, just as Lample et al., 2019 (XLM) used language embeddings (1 for added, 2 for deleted).
47 47
48 ### Language support 48 ### Language support
49 -| Language | Added | Diff | 49 +| Language | Added | Diff | Data(Diff) | Weights |
50 -| :------------- | :---: | :---:| 50 +| :------------- | :---: | :---:| :---: | :---:|
51 -| Python | ✅ | ✅ | 51 +| Python | ✅ | ✅ | [link](https://drive.google.com/drive/folders/1_8lQmzTH95Nc-4MKd1RP3x4BVc8tBA6W?usp=sharing) | [link](https://drive.google.com/drive/folders/1OwM7_FiLiwVJAhAanBPWtPw3Hz3Dszbh?usp=sharing) |
52 -| JavaScript | ⬜ | ⬜ | 52 +| JavaScript | ⬜ | ⬜ | ⬜ | ⬜ |
53 -| Go | ⬜ | ⬜ | 53 +| Go | ⬜ | ⬜ | ⬜ | ⬜ |
54 -| JAVA | ⬜ | ⬜ | 54 +| JAVA | ⬜ | ⬜ | ⬜ | ⬜ |
55 -| Ruby | ⬜ | ⬜ | 55 +| Ruby | ⬜ | ⬜ | ⬜ | ⬜ |
56 -| PHP | ⬜ | ⬜ | 56 +| PHP | ⬜ | ⬜ | ⬜ | ⬜ |
57 * ✅ — Supported 57 * ✅ — Supported
58 -* 🔶 — Partial support
59 -* 🚧 — Under development
60 * ⬜ - N/A ️ 58 * ⬜ - N/A ️
61 59
62 We plan to gradually add support for the languages that are not yet covered. However, training models for these languages requires expensive GPU instances on AWS or GCP. Please consider sponsoring this project! 60 We plan to gradually add support for the languages that are not yet covered. However, training models for these languages requires expensive GPU instances on AWS or GCP. Please consider sponsoring this project!
...@@ -68,9 +66,18 @@ To run this project, you need a flask-based inference server (GPU) and a client ...@@ -68,9 +66,18 @@ To run this project, you need a flask-based inference server (GPU) and a client
68 Prepare Docker and Nvidia-docker before running the server. 66 Prepare Docker and Nvidia-docker before running the server.
69 67
70 ##### 1-a. If you have a GPU machine. 68 ##### 1-a. If you have a GPU machine.
71 -Serve flask server with Nvidia Docker 69 +Serve the Flask server with Nvidia Docker. Look up the Docker tag for your programming language [here](https://hub.docker.com/repository/registry-1.docker.io/graykode/commit-autosuggestions/tags).
70 +| Language | Tag |
71 +| :------------- | :---: |
72 +| Python | py |
73 +| JavaScript | js |
74 +| Go | go |
75 +| JAVA | java |
76 +| Ruby | ruby |
77 +| PHP | php |
78 +
72 ```shell script 79 ```shell script
73 -$ docker run -it --gpus 0 -p 5000:5000 commit-autosuggestions:0.1-gpu 80 +$ docker run -it -d --gpus 0 -p 5000:5000 graykode/commit-autosuggestions:{language}
74 ``` 81 ```
75 82
76 ##### 1-b. If you don't have a GPU machine. 83 ##### 1-b. If you don't have a GPU machine.
......
...@@ -10,14 +10,14 @@ ARG ADDED_MODEL="1YrkwfM-0VBCJaa9NYaXUQPODdGPsmQY4" ...@@ -10,14 +10,14 @@ ARG ADDED_MODEL="1YrkwfM-0VBCJaa9NYaXUQPODdGPsmQY4"
10 ARG DIFF_MODEL="1--gcVVix92_Fp75A-mWH0pJS0ahlni5m" 10 ARG DIFF_MODEL="1--gcVVix92_Fp75A-mWH0pJS0ahlni5m"
11 11
12 RUN git clone https://github.com/graykode/commit-autosuggestions.git /app/commit-autosuggestions \ 12 RUN git clone https://github.com/graykode/commit-autosuggestions.git /app/commit-autosuggestions \
13 - && cd /app/commit-autosuggestions && python3 setup.py install 13 + && cd /app/commit-autosuggestions
14 14
15 WORKDIR /app/commit-autosuggestions 15 WORKDIR /app/commit-autosuggestions
16 16
17 RUN pip3 install ${PYTORCH_WHEEL} gdown 17 RUN pip3 install ${PYTORCH_WHEEL} gdown
18 -RUN gdown https://drive.google.com/uc?id=${ADDED_MODEL} -O weight/added/ 18 +RUN gdown https://drive.google.com/uc?id=${ADDED_MODEL} -O weight/python/added/
19 -RUN gdown https://drive.google.com/uc?id=${DIFF_MODEL} -O weight/diff/ 19 +RUN gdown https://drive.google.com/uc?id=${DIFF_MODEL} -O weight/python/diff/
20 20
21 RUN pip3 install -r requirements.txt 21 RUN pip3 install -r requirements.txt
22 22
23 -ENTRYPOINT ["python3", "app.py"] 23 +ENTRYPOINT ["python3", "app.py", "--load_model_path", "./weights/python/"]
......
...@@ -24,6 +24,15 @@ from multiprocessing.pool import Pool ...@@ -24,6 +24,15 @@ from multiprocessing.pool import Pool
24 from transformers import RobertaTokenizer 24 from transformers import RobertaTokenizer
25 from pydriller import RepositoryMining 25 from pydriller import RepositoryMining
26 26
27 +language = {
28 + 'py' : ['.py'],
29 + 'js' : ['.js', '.ts'],
30 + 'go' : ['.go'],
31 + 'java' : ['.java'],
32 + 'ruby' : ['.rb'],
33 + 'php' : ['.php']
34 +}
35 +
27 def message_cleaner(message): 36 def message_cleaner(message):
28 msg = message.split("\n")[0] 37 msg = message.split("\n")[0]
29 msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg) 38 msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
...@@ -34,7 +43,7 @@ def jobs(repo, args): ...@@ -34,7 +43,7 @@ def jobs(repo, args):
34 repo_path = os.path.join(args.repos_dir, repo) 43 repo_path = os.path.join(args.repos_dir, repo)
35 if os.path.exists(repo_path): 44 if os.path.exists(repo_path):
36 for commit in RepositoryMining( 45 for commit in RepositoryMining(
37 - repo_path, only_modifications_with_file_types=['.py'] 46 + repo_path, only_modifications_with_file_types=language[args.lang]
38 ).traverse_commits(): 47 ).traverse_commits():
39 cleaned_message = message_cleaner(commit.msg) 48 cleaned_message = message_cleaner(commit.msg)
40 tokenized_message = args.tokenizer.tokenize(cleaned_message) 49 tokenized_message = args.tokenizer.tokenize(cleaned_message)
...@@ -44,7 +53,7 @@ def jobs(repo, args): ...@@ -44,7 +53,7 @@ def jobs(repo, args):
44 for mod in commit.modifications: 53 for mod in commit.modifications:
45 if not (mod.old_path and mod.new_path): 54 if not (mod.old_path and mod.new_path):
46 continue 55 continue
47 - if os.path.splitext(mod.new_path)[1] != '.py': 56 + if os.path.splitext(mod.new_path)[1] not in language[args.lang]:
48 continue 57 continue
49 if not mod.diff_parsed["added"]: 58 if not mod.diff_parsed["added"]:
50 continue 59 continue
...@@ -121,6 +130,9 @@ if __name__ == "__main__": ...@@ -121,6 +130,9 @@ if __name__ == "__main__":
121 help="directory that all repositories had been downloaded.",) 130 help="directory that all repositories had been downloaded.",)
122 parser.add_argument("--output_dir", type=str, required=True, 131 parser.add_argument("--output_dir", type=str, required=True,
123 help="The output directory where the preprocessed data will be written.") 132 help="The output directory where the preprocessed data will be written.")
133 + parser.add_argument("--lang", type=str, required=True,
134 + choices=['py', 'js', 'go', 'java', 'ruby', 'php'],
135 + help="The output directory where the preprocessed data will be written.")
124 parser.add_argument("--tokenizer_name", type=str, 136 parser.add_argument("--tokenizer_name", type=str,
125 default="microsoft/codebert-base", help="The name of tokenizer",) 137 default="microsoft/codebert-base", help="The name of tokenizer",)
126 parser.add_argument("--num_workers", default=4, type=int, help="number of process") 138 parser.add_argument("--num_workers", default=4, type=int, help="number of process")
......