TrainingByBi-Sent2Vec.ipynb
4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Untitled0.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gUZeRq7OjdB0",
"outputId": "e4b56ee0-f158-431d-c2c4-6bbe40be0d4e"
},
"source": [
"!git clone https://github.com/epfml/Bi-Sent2Vec.git"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Cloning into 'Bi-Sent2Vec'...\n",
"remote: Enumerating objects: 55, done.\u001b[K\n",
"remote: Counting objects: 100% (55/55), done.\u001b[K\n",
"remote: Compressing objects: 100% (45/45), done.\u001b[K\n",
"remote: Total 55 (delta 17), reused 35 (delta 7), pack-reused 0\u001b[K\n",
"Unpacking objects: 100% (55/55), done.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "X-AKTcejjmGE",
"outputId": "01e68019-ab89-4645-e60d-6d933f025a8e"
},
"source": [
"!cd Bi-Sent2Vec/ && make"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/args.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/dictionary.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/productquantizer.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/matrix.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/shmem_matrix.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/qmatrix.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/vector.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/model.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/utils.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops -c src/fasttext.cc\n",
"c++ -pthread -std=c++0x -O3 -funroll-loops args.o dictionary.o productquantizer.o matrix.o shmem_matrix.o qmatrix.o vector.o model.o utils.o fasttext.o src/main.cc -o fasttext -lrt\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Jb8Fy8vekIVL",
"outputId": "bfc5acc5-fe30-4f1e-dfaf-1285fd59afea"
},
"source": [
"!cd Bi-Sent2Vec/ && ./fasttext bisent2vec -input ../drive/MyDrive/TrainingdataWithPOSOnlyKo.txt -output ../drive/MyDrive/lastModel -dim 300 -lr 0.2 -neg 10 -bucket 2000000 -maxVocabSize 750000 -thread 30 -t 0.000005 -epoch 5 -minCount 8 -dropoutK 4 -loss ns -wordNgrams 2 -numCheckPoints 5"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Read 95M words\n",
"Number of words: 118144\n",
"Number of labels: 0\n",
"tcmalloc: large alloc 2541780992 bytes == 0x55d87b74a000 @ 0x7f822311c887 0x55d86f321a9c 0x55d86f33e278 0x55d86f33c0ba 0x55d86f33fec5 0x55d86f308632 0x7f8221fb1bf7 0x55d86f3088fa\n",
"Progress: 20.0% words/sec/thread: 72927 lr: 0.159999 loss: 1.837360 eta: 0h2m \n",
"Saving Model ----- Checkpoint 1\n",
"Progress: 40.0% words/sec/thread: 71615 lr: 0.119999 loss: 1.784353 eta: 0h2m \n",
"Saving Model ----- Checkpoint 2\n",
"Progress: 60.0% words/sec/thread: 70740 lr: 0.080000 loss: 1.709536 eta: 0h1m \n",
"Saving Model ----- Checkpoint 3\n",
"Progress: 80.0% words/sec/thread: 70405 lr: 0.039999 loss: 1.627526 eta: 0h0m \n",
"Saving Model ----- Checkpoint 4\n",
"Progress: 100.0% words/sec/thread: 70414 lr: 0.000000 loss: 1.567680 eta: 0h0m \n"
],
"name": "stdout"
}
]
}
]
}