franz96521 committed on
Commit
a53f410
1 Parent(s): 3393cd3
.gitattributes CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ models/** filter=lfs diff=lfs merge=lfs -text
29
+ AbstractGenerator/** filter=lfs diff=lfs merge=lfs -text
AbstractGenerator.ipynb ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "WARNING:tensorflow:From C:\\Users\\franz\\AppData\\Local\\Temp\\ipykernel_14092\\1198363771.py:6: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.\n",
13
+ "Instructions for updating:\n",
14
+ "Use `tf.config.list_physical_devices('GPU')` instead.\n",
15
+ "GPU is available\n"
16
+ ]
17
+ }
18
+ ],
19
+ "source": [
20
+ "\n",
21
+ "import gpt_2_simple as gpt2\n",
22
+ "import os\n",
23
+ "import tensorflow as tf\n",
24
+ "import pandas as pd\n",
25
+ "import re\n",
26
+ "print(\"GPU is\", \"available\" if tf.test.is_gpu_available() else \"NOT AVAILABLE\")"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 2,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "model_name = \"124M\"\n",
36
+ "if not os.path.isdir(os.path.join(\"models\", model_name)):\n",
37
+ "\tprint(f\"Downloading {model_name} model...\")\n",
38
+ "\tgpt2.download_gpt2(model_name=model_name) "
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 3,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "path = 'AbstractGenerator/'\n",
48
+ "checkpoint_dir =path+'weights/'\n",
49
+ "data_path = path+'TrainigData/'\n",
50
+ "\n",
51
+ "\n",
52
+ "\n",
53
+ "file_name_en = 'en'\n",
54
+ "file_path_en = data_path+file_name_en\n",
55
+ "\n",
56
+ "file_name_es = 'es'\n",
57
+ "file_path_es = data_path+file_name_es\n",
58
+ "\n",
59
+ "\n",
60
+ "prefix= '<|startoftext|>'\n",
61
+ "sufix ='<|endoftext|>'"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "metadata": {},
67
+ "source": [
68
+ "# create training data"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 13,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "en = pd.read_csv('CSV\\scientific_paper_en.csv')[0:1000]\n",
78
+ "es = pd.read_csv('CSV\\scientific_paper_es.csv')[0:1000]"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 14,
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "import codecs\n",
88
+ "def createTrainingData(ds,fileName= 'resumen.txt' ,path ='TrainigData/'):\n",
89
+ " with codecs.open(path+fileName,'a','utf-8') as f:\n",
90
+ " for i in ds.index:\n",
91
+ " f.write(prefix+\"\\n\")\n",
92
+ " f.write(ds.iloc[i]['text_no_abstract'])\n",
93
+ " f.write(\"ABSTRACT\\n\")\n",
94
+ " f.write(ds.iloc[i]['abstract']+\"\\n\")\n",
95
+ " f.write(sufix)\n",
96
+ " "
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 15,
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "createTrainingData(en,'en.txt',data_path)\n",
106
+ "createTrainingData(es,'es.txt',data_path)"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "markdown",
111
+ "metadata": {},
112
+ "source": [
113
+ "# pretrained"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": null,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "sess = gpt2.start_tf_sess()\n",
123
+ "gpt2.load_gpt2(sess,checkpoint_dir=checkpoint_dir,run_name='run1')"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "markdown",
128
+ "metadata": {},
129
+ "source": [
130
+ "# train "
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 16,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "tf.compat.v1.reset_default_graph()\n",
140
+ "sess = gpt2.start_tf_sess()"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "markdown",
145
+ "metadata": {},
146
+ "source": [
147
+ "## en"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "metadata": {},
154
+ "outputs": [],
155
+ "source": [
156
+ "gpt2.finetune(sess,\n",
157
+ " file_path_en+'.txt',\n",
158
+ " model_name=model_name,\n",
159
+ " checkpoint_dir=checkpoint_dir, \n",
160
+ " steps=1000\n",
161
+ " ) "
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "markdown",
166
+ "metadata": {},
167
+ "source": [
168
+ "## es"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": 17,
174
+ "metadata": {},
175
+ "outputs": [
176
+ {
177
+ "name": "stdout",
178
+ "output_type": "stream",
179
+ "text": [
180
+ "Loading checkpoint models\\124M\\model.ckpt\n",
181
+ "INFO:tensorflow:Restoring parameters from models\\124M\\model.ckpt\n",
182
+ "Loading dataset...\n"
183
+ ]
184
+ },
185
+ {
186
+ "name": "stderr",
187
+ "output_type": "stream",
188
+ "text": [
189
+ "100%|██████████| 1/1 [00:51<00:00, 51.03s/it]\n"
190
+ ]
191
+ },
192
+ {
193
+ "name": "stdout",
194
+ "output_type": "stream",
195
+ "text": [
196
+ "dataset has 17511492 tokens\n",
197
+ "Training...\n"
198
+ ]
199
+ },
200
+ {
201
+ "ename": "ResourceExhaustedError",
202
+ "evalue": "Graph execution error:\n\nfailed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.",
203
+ "output_type": "error",
204
+ "traceback": [
205
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
206
+ "\u001b[1;31mResourceExhaustedError\u001b[0m Traceback (most recent call last)",
207
+ "File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1377\u001b[0m, in \u001b[0;36mBaseSession._do_call\u001b[1;34m(self, fn, *args)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1375'>1376</a>\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1376'>1377</a>\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1377'>1378</a>\u001b[0m \u001b[39mexcept\u001b[39;00m errors\u001b[39m.\u001b[39mOpError \u001b[39mas\u001b[39;00m e:\n",
208
+ "File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1360\u001b[0m, in \u001b[0;36mBaseSession._do_run.<locals>._run_fn\u001b[1;34m(feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1358'>1359</a>\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_extend_graph()\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1359'>1360</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_call_tf_sessionrun(options, feed_dict, fetch_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1360'>1361</a>\u001b[0m target_list, run_metadata)\n",
209
+ "File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1453\u001b[0m, in \u001b[0;36mBaseSession._call_tf_sessionrun\u001b[1;34m(self, options, feed_dict, fetch_list, target_list, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1450'>1451</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_call_tf_sessionrun\u001b[39m(\u001b[39mself\u001b[39m, options, feed_dict, fetch_list, target_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1451'>1452</a>\u001b[0m run_metadata):\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1452'>1453</a>\u001b[0m \u001b[39mreturn\u001b[39;00m tf_session\u001b[39m.\u001b[39;49mTF_SessionRun_wrapper(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_session, options, feed_dict,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1453'>1454</a>\u001b[0m fetch_list, target_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1454'>1455</a>\u001b[0m run_metadata)\n",
210
+ "\u001b[1;31mResourceExhaustedError\u001b[0m: failed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.\n",
211
+ "\nDuring handling of the above exception, another exception occurred:\n",
212
+ "\u001b[1;31mResourceExhaustedError\u001b[0m Traceback (most recent call last)",
213
+ "\u001b[1;32mc:\\Users\\franz\\OneDrive\\Documentos\\GitHub\\Generador-de-abstracts\\AbstractGenerator.ipynb Cell 15'\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=0'>1</a>\u001b[0m gpt2\u001b[39m.\u001b[39;49mfinetune(sess,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=1'>2</a>\u001b[0m file_path_es\u001b[39m+\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m.txt\u001b[39;49m\u001b[39m'\u001b[39;49m,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=2'>3</a>\u001b[0m model_name\u001b[39m=\u001b[39;49mmodel_name,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=3'>4</a>\u001b[0m checkpoint_dir\u001b[39m=\u001b[39;49mcheckpoint_dir, \n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=4'>5</a>\u001b[0m steps\u001b[39m=\u001b[39;49m\u001b[39m1000\u001b[39;49m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=5'>6</a>\u001b[0m )\n",
214
+ "File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\gpt_2_simple\\gpt_2.py:339\u001b[0m, in \u001b[0;36mfinetune\u001b[1;34m(sess, dataset, steps, model_name, model_dir, combine, batch_size, learning_rate, accumulate_gradients, restore_from, run_name, checkpoint_dir, sample_every, sample_length, sample_num, multi_gpu, save_every, print_every, max_checkpoints, use_memory_saving_gradients, only_train_transformer_layers, optimizer, overwrite, reuse)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=336'>337</a>\u001b[0m sess\u001b[39m.\u001b[39mrun(opt_reset)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=337'>338</a>\u001b[0m \u001b[39mfor\u001b[39;00m _ \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(accumulate_gradients):\n\u001b[1;32m--> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=338'>339</a>\u001b[0m sess\u001b[39m.\u001b[39;49mrun(\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=339'>340</a>\u001b[0m opt_compute, feed_dict\u001b[39m=\u001b[39;49m{context: sample_batch()})\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=340'>341</a>\u001b[0m (v_loss, v_summary) \u001b[39m=\u001b[39m sess\u001b[39m.\u001b[39mrun((opt_apply, summary_loss))\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=341'>342</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n",
215
+ "File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:967\u001b[0m, in \u001b[0;36mBaseSession.run\u001b[1;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=963'>964</a>\u001b[0m run_metadata_ptr \u001b[39m=\u001b[39m tf_session\u001b[39m.\u001b[39mTF_NewBuffer() \u001b[39mif\u001b[39;00m run_metadata \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=965'>966</a>\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=966'>967</a>\u001b[0m result \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_run(\u001b[39mNone\u001b[39;49;00m, fetches, feed_dict, options_ptr,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=967'>968</a>\u001b[0m run_metadata_ptr)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=968'>969</a>\u001b[0m \u001b[39mif\u001b[39;00m run_metadata:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=969'>970</a>\u001b[0m proto_data \u001b[39m=\u001b[39m tf_session\u001b[39m.\u001b[39mTF_GetBuffer(run_metadata_ptr)\n",
216
+ "File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1190\u001b[0m, in \u001b[0;36mBaseSession._run\u001b[1;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1186'>1187</a>\u001b[0m \u001b[39m# We only want to really perform the run if fetches or targets are provided,\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1187'>1188</a>\u001b[0m \u001b[39m# or if the call is a partial run that specifies feeds.\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1188'>1189</a>\u001b[0m \u001b[39mif\u001b[39;00m final_fetches \u001b[39mor\u001b[39;00m final_targets \u001b[39mor\u001b[39;00m (handle \u001b[39mand\u001b[39;00m feed_dict_tensor):\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1189'>1190</a>\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_do_run(handle, final_targets, final_fetches,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1190'>1191</a>\u001b[0m feed_dict_tensor, options, run_metadata)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1191'>1192</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1192'>1193</a>\u001b[0m results \u001b[39m=\u001b[39m []\n",
217
+ "File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1370\u001b[0m, in \u001b[0;36mBaseSession._do_run\u001b[1;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1366'>1367</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_call_tf_sessionprun(handle, feed_dict, fetch_list)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1368'>1369</a>\u001b[0m \u001b[39mif\u001b[39;00m handle \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1369'>1370</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_do_call(_run_fn, feeds, fetches, targets, options,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1370'>1371</a>\u001b[0m run_metadata)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1371'>1372</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1372'>1373</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_do_call(_prun_fn, handle, feeds, fetches)\n",
218
+ "File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1396\u001b[0m, in \u001b[0;36mBaseSession._do_call\u001b[1;34m(self, fn, *args)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1390'>1391</a>\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m'\u001b[39m\u001b[39monly supports NHWC tensor format\u001b[39m\u001b[39m'\u001b[39m \u001b[39min\u001b[39;00m message:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1391'>1392</a>\u001b[0m message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mA possible workaround: Try disabling Grappler optimizer\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1392'>1393</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mby modifying the config for creating the session eg.\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1393'>1394</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39msession_config.graph_options.rewrite_options.\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1394'>1395</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39mdisable_meta_optimizer = True\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1395'>1396</a>\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mtype\u001b[39m(e)(node_def, op, message)\n",
219
+ "\u001b[1;31mResourceExhaustedError\u001b[0m: Graph execution error:\n\nfailed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode."
220
+ ]
221
+ }
222
+ ],
223
+ "source": [
224
+ "gpt2.finetune(sess,\n",
225
+ " file_path_es+'.txt',\n",
226
+ " model_name=model_name,\n",
227
+ " checkpoint_dir=checkpoint_dir, \n",
228
+ " steps=1000\n",
229
+ " ) "
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "markdown",
234
+ "metadata": {},
235
+ "source": [
236
+ "# test"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "markdown",
241
+ "metadata": {},
242
+ "source": [
243
+ "## en "
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": null,
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": [
252
+ "text = \"\"\"Introduction and preliminaries\n",
253
+ "The focus of this paper is decompositions of (k, `)-sparse graphs into edge-disjoint subgraphs\n",
254
+ "that certify sparsity. We use graph to mean a multigraph, possibly with loops. We say that a\n",
255
+ "graph is (k, `)-sparse if no subset of n′ vertices spans more than kn′− ` edges in the graph; a\n",
256
+ "(k, `)-sparse graph with kn′− ` edges is (k, `)-tight. We call the range k ≤ `≤ 2k−1 the upper\n",
257
+ "range of sparse graphs and 0≤ `≤ k the lower range.\n",
258
+ "In this paper, we present efficient algorithms for finding decompositions that certify sparsity\n",
259
+ "in the upper range of `. Our algorithms also apply in the lower range, which was already ad-\n",
260
+ "dressed by [3, 4, 5, 6, 19]. A decomposition certifies the sparsity of a graph if the sparse graphs\n",
261
+ "and graphs admitting the decomposition coincide.\n",
262
+ "Our algorithms are based on a new characterization of sparse graphs, which we call the\n",
263
+ "pebble game with colors. The pebble game with colors is a simple graph construction rule that\n",
264
+ "produces a sparse graph along with a sparsity-certifying decomposition.\n",
265
+ "We define and study a canonical class of pebble game constructions, which correspond to\n",
266
+ "previously studied decompositions of sparse graphs into edge disjoint trees. Our results provide\n",
267
+ "a unifying framework for all the previously known special cases, including Nash-Williams-\n",
268
+ "Tutte and [7, 24]. Indeed, in the lower range, canonical pebble game constructions capture the\n",
269
+ "properties of the augmenting paths used in matroid union and intersection algorithms[5, 6].\n",
270
+ "Since the sparse graphs in the upper range are not known to be unions or intersections of the\n",
271
+ "matroids for which there are efficient augmenting path algorithms, these do not easily apply in\n",
272
+ "∗ Research of both authors funded by the NSF under grants NSF CCF-0430990 and NSF-DARPA CARGO\n",
273
+ "CCR-0310661 to the first author.\n",
274
+ "2 Ileana Streinu, Louis Theran\n",
275
+ "Term Meaning\n",
276
+ "Sparse graph G Every non-empty subgraph on n′ vertices has ≤ kn′− ` edges\n",
277
+ "Tight graph G G = (V,E) is sparse and |V |= n, |E|= kn− `\n",
278
+ "Block H in G G is sparse, and H is a tight subgraph\n",
279
+ "Component H of G G is sparse and H is a maximal block\n",
280
+ "Map-graph Graph that admits an out-degree-exactly-one orientation\n",
281
+ "(k, `)-maps-and-trees Edge-disjoint union of ` trees and (k− `) map-grpahs\n",
282
+ "`Tk Union of ` trees, each vertex is in exactly k of them\n",
283
+ "Set of tree-pieces of an `Tk induced on V ′ ⊂V Pieces of trees in the `Tk spanned by E(V ′)\n",
284
+ "Proper `Tk Every V ′ ⊂V contains ≥ ` pieces of trees from the `Tk\n",
285
+ "Table 1. Sparse graph and decomposition terminology used in this paper.\n",
286
+ "the upper range. Pebble game with colors constructions may thus be considered a strengthening\n",
287
+ "of augmenting paths to the upper range of matroidal sparse graphs.\n",
288
+ "1.1. Sparse graphs\n",
289
+ "\n",
290
+ "ABSTRACT\n",
291
+ "\"\"\""
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "metadata": {},
298
+ "outputs": [],
299
+ "source": [
300
+ "gpt2.generate(sess,prefix=text,truncate=sufix,checkpoint_dir=checkpoint_dir,nsamples=1)"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "markdown",
305
+ "metadata": {},
306
+ "source": [
307
+ "## es"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": null,
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": [
316
+ "text = \"\"\"El foco de este documento son las descomposicións de (k, `)-sparse gráficos en bordes-disjunto subgraphs\n",
317
+ "que certifique la escasez. Usamos el gráfico para significar un múltiplo, posiblemente con bucles. Nosotros decimos que un\n",
318
+ "grafo es (k, `)-sparse si ningún subconjunto de n′ vértices abarca más de kn ` bordes en el gráfico; a\n",
319
+ "(k, `)-sparse gráfico con kn ` bordes es (k, `)-estrechado. Llamamos al rango k ≤ 2k−1 el superior\n",
320
+ "rango de gráficos escasos y 0≤ k el rango inferior.\n",
321
+ "En este artículo, presentamos algoritmos eficientes para encontrar descomposicións que certifiquen la escasez\n",
322
+ "en el rango superior de `. Nuestros algoritmos también se aplican en el rango inferior, que ya era ad-\n",
323
+ "vestido por [3, 4, 5, 6, 19]. Una descomposición certifica la escasez de un gráfico si los gráficos dispersos\n",
324
+ "y los gráficos que admiten la descomposición coinciden.\n",
325
+ "Nuestros algoritmos se basan en una nueva caracterización de gráficos escasos, que llamamos el\n",
326
+ "juego de guijarros con colores. El juego de guijarros con colores es una regla de construcción de gráficos simples que\n",
327
+ "produce un gráfico escaso junto con una descomposición certificadora de la escasez.\n",
328
+ "Definimos y estudiamos una clase canónica de construcciones de juego de guijarros, que corresponden a\n",
329
+ "previamente estudiado las descomposiciones de los gráficos escasos en los árboles disjuntos del borde. Nuestros resultados proporcionan\n",
330
+ "un marco unificador para todos los casos especiales conocidos anteriormente, incluidos Nash-Williams-\n",
331
+ "Tutte y [7, 24]. De hecho, en el rango inferior, las construcciones canónicas de juego de guijarros capturan la\n",
332
+ "propiedades de las rutas de aumento utilizadas en los algoritmos de unión de matroides y de intersección[5, 6].\n",
333
+ "Dado que los gráficos escasos en el rango superior no se sabe que son uniones o intersecciones de la\n",
334
+ "matroides para los que hay algoritmos de ruta de aumento eficiente, estos no se aplican fácilmente en\n",
335
+ "* Investigación de ambos autores financiada por la NSF bajo subvenciones NSF CCF-0430990 y NSF-DARPA CARGO\n",
336
+ "CCR-0310661 al primer autor.\n",
337
+ "2 Ileana Streinu, Louis Theran\n",
338
+ "Significado del término\n",
339
+ "Gráfico escaso G Cada subgrafo no vacío en n′ vértices tiene ≤ kn ` bordes\n",
340
+ "El gráfico ajustado G G = (V,E) es escaso y V = n, E= kn− `\n",
341
+ "El bloque H en G G es escaso, y H es un subgrafo apretado\n",
342
+ "El componente H de G G es escaso y H es un bloqueo máximo\n",
343
+ "Gráfico cartográfico que admite una orientación de grado-exactamente-uno\n",
344
+ "(k, `)-maps-and-trees Edge-disjunt union de ` árboles y (k- `) map-grpahs\n",
345
+ "`Tk Unión de ` árboles, cada vértice está exactamente en k de ellos\n",
346
+ "Conjunto de piezas arbóreas de un `Tk inducido en V ′ ́V Piezas de árboles en el `Tk extendido por E(V ′)\n",
347
+ "`Tk Apropiado Cada V ′ V contiene ≥ ` pedazos de árboles de la `Tk\n",
348
+ "Cuadro 1 Gráfico escaso y terminología de descomposición utilizada en este artículo.\n",
349
+ "el rango superior. Pebble juego con construcciones de colores por lo tanto puede ser considerado un fortalecimiento\n",
350
+ "de caminos de aumento a la gama superior de gráficos de la escasez matroidal.\n",
351
+ "1.1. Gráficos escasos\n",
352
+ "Un gráfico es (k, `)-sparse si para cualquier subgrafo no vacío con bordes m′ y n′ vértices, m′ ≤\n",
353
+ "kn `. Observamos que esta condición implica que 0 ≤ ` ≤ 2k− 1, y a partir de ahora en este\n",
354
+ "Haremos esta suposición. Un gráfico escaso que tiene n vértices y exactamente bordes kn\n",
355
+ "se llama apretado.\n",
356
+ "Para un gráfico G = (V,E), y V ′ V, utilizamos el intervalo de notación (V ′) para el número de bordes\n",
357
+ "en el subgráfico inducido por V ′. En un gráfico dirigido, out(V ′) es el número de bordes con la cola\n",
358
+ "en V ′ y la cabeza en V −V ′; para un subgráfico inducido por V ′, llamamos a tal borde un borde superior.\n",
359
+ "Hay dos tipos importantes de subgrafías de gráficos escasos. Un bloque es un subgrafo apretado de\n",
360
+ "un gráfico escaso. Un componente es un bloque máximo.\n",
361
+ "La Tabla 1 resume la escasa terminología gráfica utilizada en este artículo.\n",
362
+ "1.2. Descomposiciónes de certificación de la sparsidad\n",
363
+ "Un k-arborescencia es un gráfico que admite una descomposición en k borde-desjunto que abarca los árboles.\n",
364
+ "La Figura 1(a) muestra un ejemplo de una 3-arborescencia. Se describen los gráficos k-arborescentes\n",
365
+ "por los conocidos teoremas de Tutte [23] y Nash-Williams [17] como exactamente el (k,k) apretado\n",
366
+ "gráficos.\n",
367
+ "ABSTRACT\n",
368
+ "\"\"\""
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": null,
374
+ "metadata": {},
375
+ "outputs": [],
376
+ "source": [
377
+ "gpt2.generate(sess,prefix=text,truncate=sufix,checkpoint_dir=checkpoint_dir,nsamples=1)"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "markdown",
382
+ "metadata": {},
383
+ "source": [
384
+ "# gradio interface"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": null,
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "def generateAbstract(text):\n",
394
+ " # with tf.compat.v1.variable_scope(\"weight\", reuse = True):\n",
395
+ " #sess = tf.compat.v1.get_variable('sess',gpt2.start_tf_sess())\n",
396
+ " tf.compat.v1.reset_default_graph()\n",
397
+ " sess = gpt2.start_tf_sess()\n",
398
+ " gpt2.load_gpt2(sess,checkpoint_dir=checkpoint_dir,run_name='run1')\n",
399
+ " txt = gpt2.generate(sess,prefix=str(text)+\"\\nABSTRACT\", return_as_list=True,truncate=sufix,checkpoint_dir=checkpoint_dir,nsamples=1)[0]\n",
400
+ " return str(txt[txt.find('ABSTRACT'):])\n",
401
+ "\n",
402
+ "\n",
403
+ "\n",
404
+ "iface = gr.Interface(fn=generateAbstract, inputs=gr.inputs.Textbox(lines=10, placeholder=\"text\"), outputs=\"textbox\")\n",
405
+ "iface.launch(debug = True )"
406
+ ]
407
+ }
408
+ ],
409
+ "metadata": {
410
+ "interpreter": {
411
+ "hash": "53fbdc69e3e12c371950068c144423682c30d04ec68c2bd46937202e33e0058d"
412
+ },
413
+ "kernelspec": {
414
+ "display_name": "Python 3.7.11 ('receta')",
415
+ "language": "python",
416
+ "name": "python3"
417
+ },
418
+ "language_info": {
419
+ "codemirror_mode": {
420
+ "name": "ipython",
421
+ "version": 3
422
+ },
423
+ "file_extension": ".py",
424
+ "mimetype": "text/x-python",
425
+ "name": "python",
426
+ "nbconvert_exporter": "python",
427
+ "pygments_lexer": "ipython3",
428
+ "version": "3.9.7"
429
+ },
430
+ "orig_nbformat": 4
431
+ },
432
+ "nbformat": 4,
433
+ "nbformat_minor": 2
434
+ }
AbstractGenerator/TrainigData/en.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:814f983aa49ccc33a993a7d12f67a2eb2a7ca0b15d8697e82b50d3a19f3e1595
3
+ size 35400974
AbstractGenerator/TrainigData/es.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2454067cfe384e1d824b3f5d29cb5c4e1ff292289ad4b37c6cbd22f5cc715295
3
+ size 44460970
AbstractGenerator/weights/run1/encoder.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783
3
+ size 1042301
AbstractGenerator/weights/run1/events.out.tfevents.1648184225.FRANZ96521-W11 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a88ba7f3268f11289fb24fd13db1367b91acce6466c4ad394011e10ea4c304
3
+ size 82
AbstractGenerator/weights/run1/events.out.tfevents.1648184499.FRANZ96521-W11 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb8646e6bf1e1b8cc26f8128ec4e4c2e797dac297939450a8bf46057e7388a6a
3
+ size 82
AbstractGenerator/weights/run1/events.out.tfevents.1648229481.FRANZ96521-W11 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04d1f71db542da83fee4fe8574bf382cb5324b6decef506206250b8fea85abd0
3
+ size 82
AbstractGenerator/weights/run1/hparams.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9d56e4121c427164e0c55c6f03c08e1daf9002b9b672825112d19097b680318
3
+ size 90
AbstractGenerator/weights/run1/vocab.bpe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
3
+ size 456318
Descarga.ipynb ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import json\n",
11
+ "from pandas import json_normalize\n",
12
+ "import requests\n",
13
+ "from pathlib import Path\n",
14
+ "from multiprocessing.pool import ThreadPool as Pool\n",
15
+ "import codecs\n",
16
+ "import random\n",
17
+ "import re"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 3,
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "URL_BASE = \"https://arxiv.org/pdf/\"\n",
27
+ "PDF_PATH = 'PDF'\n",
28
+ "TXT_PATH= 'TXT'"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "metadata": {},
34
+ "source": [
35
+ "# Arxiv\n"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 4,
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "name": "stdout",
45
+ "output_type": "stream",
46
+ "text": [
47
+ "<class 'pandas.core.frame.DataFrame'>\n"
48
+ ]
49
+ },
50
+ {
51
+ "data": {
52
+ "text/html": [
53
+ "<div>\n",
54
+ "<style scoped>\n",
55
+ " .dataframe tbody tr th:only-of-type {\n",
56
+ " vertical-align: middle;\n",
57
+ " }\n",
58
+ "\n",
59
+ " .dataframe tbody tr th {\n",
60
+ " vertical-align: top;\n",
61
+ " }\n",
62
+ "\n",
63
+ " .dataframe thead th {\n",
64
+ " text-align: right;\n",
65
+ " }\n",
66
+ "</style>\n",
67
+ "<table border=\"1\" class=\"dataframe\">\n",
68
+ " <thead>\n",
69
+ " <tr style=\"text-align: right;\">\n",
70
+ " <th></th>\n",
71
+ " <th>id</th>\n",
72
+ " <th>title</th>\n",
73
+ " <th>abstract</th>\n",
74
+ " <th>Text</th>\n",
75
+ " </tr>\n",
76
+ " </thead>\n",
77
+ " <tbody>\n",
78
+ " <tr>\n",
79
+ " <th>0</th>\n",
80
+ " <td>0704.0001</td>\n",
81
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
82
+ " <td>A fully differential calculation in perturba...</td>\n",
83
+ " <td></td>\n",
84
+ " </tr>\n",
85
+ " <tr>\n",
86
+ " <th>1</th>\n",
87
+ " <td>0704.0002</td>\n",
88
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
89
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
90
+ " <td></td>\n",
91
+ " </tr>\n",
92
+ " <tr>\n",
93
+ " <th>2</th>\n",
94
+ " <td>0704.0003</td>\n",
95
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
96
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
97
+ " <td></td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>3</th>\n",
101
+ " <td>0704.0004</td>\n",
102
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
103
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
104
+ " <td></td>\n",
105
+ " </tr>\n",
106
+ " <tr>\n",
107
+ " <th>4</th>\n",
108
+ " <td>0704.0005</td>\n",
109
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
110
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
111
+ " <td></td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>...</th>\n",
115
+ " <td>...</td>\n",
116
+ " <td>...</td>\n",
117
+ " <td>...</td>\n",
118
+ " <td>...</td>\n",
119
+ " </tr>\n",
120
+ " <tr>\n",
121
+ " <th>1996</th>\n",
122
+ " <td>0704.1997</td>\n",
123
+ " <td>Query on Negative Temperature, Internal Intera...</td>\n",
124
+ " <td>After negative temperature is restated, we f...</td>\n",
125
+ " <td></td>\n",
126
+ " </tr>\n",
127
+ " <tr>\n",
128
+ " <th>1997</th>\n",
129
+ " <td>0704.1998</td>\n",
130
+ " <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
131
+ " <td>A scale invariant model containing dilaton $...</td>\n",
132
+ " <td></td>\n",
133
+ " </tr>\n",
134
+ " <tr>\n",
135
+ " <th>1998</th>\n",
136
+ " <td>0704.1999</td>\n",
137
+ " <td>Dark matter caustics and the enhancement of se...</td>\n",
138
+ " <td>Cold dark matter haloes are populated by cau...</td>\n",
139
+ " <td></td>\n",
140
+ " </tr>\n",
141
+ " <tr>\n",
142
+ " <th>1999</th>\n",
143
+ " <td>0704.2000</td>\n",
144
+ " <td>Search for a Higgs boson produced in associati...</td>\n",
145
+ " <td>We describe a search for the standard model ...</td>\n",
146
+ " <td></td>\n",
147
+ " </tr>\n",
148
+ " <tr>\n",
149
+ " <th>2000</th>\n",
150
+ " <td>0704.2001</td>\n",
151
+ " <td>Geometry of Parallelizable Manifolds in the Co...</td>\n",
152
+ " <td>In this paper, we deal with a generalization...</td>\n",
153
+ " <td></td>\n",
154
+ " </tr>\n",
155
+ " </tbody>\n",
156
+ "</table>\n",
157
+ "<p>2001 rows × 4 columns</p>\n",
158
+ "</div>"
159
+ ],
160
+ "text/plain": [
161
+ " id title \\\n",
162
+ "0 0704.0001 Calculation of prompt diphoton production cros... \n",
163
+ "1 0704.0002 Sparsity-certifying Graph Decompositions \n",
164
+ "2 0704.0003 The evolution of the Earth-Moon system based o... \n",
165
+ "3 0704.0004 A determinant of Stirling cycle numbers counts... \n",
166
+ "4 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
167
+ "... ... ... \n",
168
+ "1996 0704.1997 Query on Negative Temperature, Internal Intera... \n",
169
+ "1997 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
170
+ "1998 0704.1999 Dark matter caustics and the enhancement of se... \n",
171
+ "1999 0704.2000 Search for a Higgs boson produced in associati... \n",
172
+ "2000 0704.2001 Geometry of Parallelizable Manifolds in the Co... \n",
173
+ "\n",
174
+ " abstract Text \n",
175
+ "0 A fully differential calculation in perturba... \n",
176
+ "1 We describe a new algorithm, the $(k,\\ell)$-... \n",
177
+ "2 The evolution of Earth-Moon system is descri... \n",
178
+ "3 We show that a determinant of Stirling cycle... \n",
179
+ "4 In this paper we show how to compute the $\\L... \n",
180
+ "... ... ... \n",
181
+ "1996 After negative temperature is restated, we f... \n",
182
+ "1997 A scale invariant model containing dilaton $... \n",
183
+ "1998 Cold dark matter haloes are populated by cau... \n",
184
+ "1999 We describe a search for the standard model ... \n",
185
+ "2000 In this paper, we deal with a generalization... \n",
186
+ "\n",
187
+ "[2001 rows x 4 columns]"
188
+ ]
189
+ },
190
+ "execution_count": 4,
191
+ "metadata": {},
192
+ "output_type": "execute_result"
193
+ }
194
+ ],
195
+ "source": [
196
+ "data = pd.read_json('ARxiv/arxiv-metadata-oai-snapshot.json',lines=True, chunksize=2001,dtype={'id':'str'})\n",
197
+ "df = None\n",
198
+ "for i in data:\n",
199
+ " df = i \n",
200
+ " print(type(i))\n",
201
+ " break\n",
202
+ "df = df[['id','title','abstract']]\n",
203
+ "df.insert(3, \"Text\", \"\") \n",
204
+ "df"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 7,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "def GetFileURL(file_id):\n",
214
+ " url = URL_BASE+file_id\n",
215
+ " r = requests.get(url, stream=True) \n",
216
+ " filename = Path(PDF_PATH+'/'+file_id+'.pdf')\n",
217
+ " response = requests.get(url)\n",
218
+ " filename.write_bytes(response.content)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 35,
224
+ "metadata": {},
225
+ "outputs": [],
226
+ "source": [
227
+ "pool_size = 16 \n",
228
+ "def worker(file):\n",
229
+ " try:\n",
230
+ " GetFileURL(file)\n",
231
+ " except:\n",
232
+ " print('error with item '+ file)\n",
233
+ " try:\n",
234
+ " with codecs.open(PDF_PATH+'/log.txt', 'a') as the_file: \n",
235
+ " the_file.writelines(str(file)+\"\\n\")\n",
236
+ " except:\n",
237
+ " print('error en log '+ file)\n",
238
+ "def get_ids(iteracion,batch=100): \n",
239
+ " inicio = int(iteracion*batch)\n",
240
+ " filesId = data[inicio :inicio + batch]['id']\n",
241
+ " return filesId\n",
242
+ "\n",
243
+ "pool = Pool(pool_size)\n",
244
+ "filesId = get_ids(19)\n",
245
+ "for file in filesId:\n",
246
+ " pool.apply_async(worker, (file,))\n",
247
+ "\n",
248
+ "pool.close()\n",
249
+ "pool.join()"
250
+ ]
251
+ }
252
+ ],
253
+ "metadata": {
254
+ "interpreter": {
255
+ "hash": "3f7e9d73c32ad96f75174922c475a50b168aad887cbaa14717912a88f31d3802"
256
+ },
257
+ "kernelspec": {
258
+ "display_name": "Python 3.9.7 ('tf-gpu')",
259
+ "language": "python",
260
+ "name": "python3"
261
+ },
262
+ "language_info": {
263
+ "codemirror_mode": {
264
+ "name": "ipython",
265
+ "version": 3
266
+ },
267
+ "file_extension": ".py",
268
+ "mimetype": "text/x-python",
269
+ "name": "python",
270
+ "nbconvert_exporter": "python",
271
+ "pygments_lexer": "ipython3",
272
+ "version": "3.9.7"
273
+ },
274
+ "orig_nbformat": 4
275
+ },
276
+ "nbformat": 4,
277
+ "nbformat_minor": 2
278
+ }
PDF_a_TXT.ipynb ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Requirement already satisfied: PyPDF2 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (1.26.0)\n",
13
+ "Requirement already satisfied: tika in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (1.24)\n",
14
+ "Requirement already satisfied: requests in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from tika) (2.27.1)\n",
15
+ "Requirement already satisfied: setuptools in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from tika) (58.0.4)\n",
16
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (2.0.4)\n",
17
+ "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (2021.10.8)\n",
18
+ "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (3.3)\n",
19
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (1.26.8)\n"
20
+ ]
21
+ }
22
+ ],
23
+ "source": [
24
+ "! pip install PyPDF2\n",
25
+ "! pip install tika"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 1,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "from tika import parser\n",
35
+ "import codecs\n",
36
+ "import os"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 2,
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "def obtener_texto(file_path,store_path):\n",
46
+ " file_data = parser.from_file(file_path)\n",
47
+ " output = file_data['content']\n",
48
+ " output = output.strip() \n",
49
+ " output= output.split('\\n')\n",
50
+ " with codecs.open(store_path+'.txt', 'w','utf-8') as the_file: \n",
51
+ " for line in output:\n",
52
+ " #print(line)\n",
53
+ " if len(line)>4: \n",
54
+ " the_file.write(str(line)+'\\n')\n"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 3,
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "name": "stderr",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "2022-03-17 17:02:20,018 [MainThread ] [WARNI] Failed to see startup log message; retrying...\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "PDF_PATH = 'PDF'\n",
72
+ "TXT_PATH= 'TXT'\n",
73
+ "files = os.listdir(PDF_PATH)\n",
74
+ "for file in files:\n",
75
+ " obtener_texto(PDF_PATH+'/'+file,TXT_PATH+'/'+file)\n",
76
+ " "
77
+ ]
78
+ }
79
+ ],
80
+ "metadata": {
81
+ "interpreter": {
82
+ "hash": "3f7e9d73c32ad96f75174922c475a50b168aad887cbaa14717912a88f31d3802"
83
+ },
84
+ "kernelspec": {
85
+ "display_name": "Python 3.9.7 ('tf-gpu')",
86
+ "language": "python",
87
+ "name": "python3"
88
+ },
89
+ "language_info": {
90
+ "codemirror_mode": {
91
+ "name": "ipython",
92
+ "version": 3
93
+ },
94
+ "file_extension": ".py",
95
+ "mimetype": "text/x-python",
96
+ "name": "python",
97
+ "nbconvert_exporter": "python",
98
+ "pygments_lexer": "ipython3",
99
+ "version": "3.9.7"
100
+ },
101
+ "orig_nbformat": 4
102
+ },
103
+ "nbformat": 4,
104
+ "nbformat_minor": 2
105
+ }
models/124M/checkpoint ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd1b025d2e155283f5e300ce95bf6d5b6bc0f7fe010db73daa6975eb896ab9cb
3
+ size 77
models/124M/encoder.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783
3
+ size 1042301
models/124M/hparams.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9d56e4121c427164e0c55c6f03c08e1daf9002b9b672825112d19097b680318
3
+ size 90
models/124M/model.ckpt.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2060c885360cc0cf41d7a6dbc4d24b5127aae20260c8b5ae521b5a6578407118
3
+ size 497759232
models/124M/model.ckpt.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71916f763f9746f9b2a06b12d91996cf1084ae008d0424543d39391c5f2dc687
3
+ size 5215
models/124M/model.ckpt.meta ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4668c448fa11531fd6700460487f73e82d3272960cea942252f8744bf225c77b
3
+ size 471155
models/124M/vocab.bpe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
3
+ size 456318
txt_to_csv.ipynb ADDED
@@ -0,0 +1,662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import os\n",
11
+ "from easynmt import EasyNMT\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": null,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "URL_BASE = \"https://arxiv.org/pdf/\"\n",
21
+ "PDF_PATH = 'PDF'\n",
22
+ "TXT_PATH= 'TXT'\n",
23
+ "CSV_PATH = 'CSV'"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "markdown",
28
+ "metadata": {},
29
+ "source": [
30
+ "# Get Data from TXT"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "data = pd.read_json('ARxiv/arxiv-metadata-oai-snapshot.json',lines=True, chunksize=2001,dtype={'id':'str'})\n",
40
+ "df = None\n",
41
+ "for i in data:\n",
42
+ " df = i \n",
43
+ " print(type(i))\n",
44
+ " break\n",
45
+ "df = df[['id','title','abstract']]\n"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "for file in df['id']:\n",
55
+ " file_path = TXT_PATH+'/'+str(file)+'.pdf.txt'\n",
56
+ " if os.path.isfile(file_path):\n",
57
+ " with open(file_path,'r',encoding='utf8') as f:\n",
58
+ " s =str( f.read()) \n",
59
+ " df.loc[df['id'] == str(file),'full_text'] = s "
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "df = df.dropna()\n",
69
+ "df.reset_index()\n",
70
+ "df.to_csv(CSV_PATH+'/scientific_paper_en.csv',index=False,encoding='utf-8')\n",
71
+ "df"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "markdown",
76
+ "metadata": {},
77
+ "source": [
78
+ "# first run \n"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "df = pd.read_csv(CSV_PATH +'/scientific_paper_en.csv',dtype={'id':'str'})\n",
88
+ "df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "markdown",
93
+ "metadata": {},
94
+ "source": [
95
+ "# leer datos"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "df = pd.read_csv(CSV_PATH +'/scientific_paper_full_text_translated.csv',dtype={'id':'str'})\n",
105
+ "print(len(df.index))\n",
106
+ "df"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "markdown",
111
+ "metadata": {},
112
+ "source": [
113
+ "# translate"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": null,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "model = EasyNMT('opus-mt')"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "markdown",
127
+ "metadata": {},
128
+ "source": [
129
+ "## translate full text"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "max = len(df.index)\n",
139
+ "for i in range(0,1754):\n",
140
+ " text = df.iloc[i]['full_text']\n",
141
+ " translated_text = model.translate(text, target_lang='es')\n",
142
+ " df.loc[i,'translated'] = translated_text \n",
143
+ " print(\"listo documento \",i)\n",
144
+ " if(i%10==0):\n",
145
+ " df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
146
+ "df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "markdown",
151
+ "metadata": {},
152
+ "source": [
153
+ "## translate abstract"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "max = len(df.index)\n",
163
+ "for i in range(0,1754):\n",
164
+ " text = df.iloc[i]['abstract']\n",
165
+ " translated_text = model.translate(text, target_lang='es')\n",
166
+ " df.loc[i,'translated_abstract'] = translated_text \n",
167
+ " print(\"listo documento \",i)\n",
168
+ " if(i%100==0):\n",
169
+ " df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
170
+ "df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
171
+ "\n"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "markdown",
176
+ "metadata": {},
177
+ "source": [
178
+ "# remove abstract"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "metadata": {},
185
+ "outputs": [],
186
+ "source": [
187
+ "max = len(df.index)-1"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": null,
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "end = 'Introducción'\n",
197
+ "for i in range(0,max):\n",
198
+ " text = df.iloc[i]['translated'] \n",
199
+ " p = text.find(end)\n",
200
+ " if(p != -1): \n",
201
+ " df.loc[i,'translated_no_abstract'] = text[p:] \n",
202
+ " else:\n",
203
+ " df.loc[i,'translated_no_abstract']= text\n",
204
+ " print(\"listo documento \",i,p)\n",
205
+ " if(i%1000==0):\n",
206
+ " df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
207
+ "df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "end = 'Abstract'\n",
217
+ "for i in range(0,max):\n",
218
+ " text = df.iloc[i]['full_text'] \n",
219
+ " p = text.find(end)\n",
220
+ " if(p != -1): \n",
221
+ " df.loc[i,'text_no_abstract'] = text[p:] \n",
222
+ " else:\n",
223
+ " df.loc[i,'text_no_abstract']= text \n",
224
+ " if(i%1000==0):\n",
225
+ " df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
226
+ "df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "markdown",
231
+ "metadata": {},
232
+ "source": [
233
+ "# split data to csv"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": null,
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "df = pd.read_csv(CSV_PATH +'/scientific_paper_full_text_translated.csv',dtype={'id':'str'})\n",
243
+ "df"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 77,
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "data": {
253
+ "text/html": [
254
+ "<div>\n",
255
+ "<style scoped>\n",
256
+ " .dataframe tbody tr th:only-of-type {\n",
257
+ " vertical-align: middle;\n",
258
+ " }\n",
259
+ "\n",
260
+ " .dataframe tbody tr th {\n",
261
+ " vertical-align: top;\n",
262
+ " }\n",
263
+ "\n",
264
+ " .dataframe thead th {\n",
265
+ " text-align: right;\n",
266
+ " }\n",
267
+ "</style>\n",
268
+ "<table border=\"1\" class=\"dataframe\">\n",
269
+ " <thead>\n",
270
+ " <tr style=\"text-align: right;\">\n",
271
+ " <th></th>\n",
272
+ " <th>id</th>\n",
273
+ " <th>title</th>\n",
274
+ " <th>full_text</th>\n",
275
+ " <th>abstract</th>\n",
276
+ " <th>text_no_abstract</th>\n",
277
+ " </tr>\n",
278
+ " </thead>\n",
279
+ " <tbody>\n",
280
+ " <tr>\n",
281
+ " <th>0</th>\n",
282
+ " <td>0704.0002</td>\n",
283
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
284
+ " <td>Descomposiciones del gráfico de certificación ...</td>\n",
285
+ " <td>Describimos un nuevo algoritmo, el juego de ...</td>\n",
286
+ " <td>Introducción y preliminares\\nEl foco de este d...</td>\n",
287
+ " </tr>\n",
288
+ " <tr>\n",
289
+ " <th>1</th>\n",
290
+ " <td>0704.0003</td>\n",
291
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
292
+ " <td>La evolución del sistema Tierra-Luna basado en...</td>\n",
293
+ " <td>La evolución del sistema Tierra-Luna es desc...</td>\n",
294
+ " <td>Introducción \\nLa teoría aceptada popularmente...</td>\n",
295
+ " </tr>\n",
296
+ " <tr>\n",
297
+ " <th>2</th>\n",
298
+ " <td>0704.0004</td>\n",
299
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
300
+ " <td>Un determinante de los números de ciclo de Sti...</td>\n",
301
+ " <td>Demostramos que un determinante de los númer...</td>\n",
302
+ " <td>Introducción El propósito principal de este ar...</td>\n",
303
+ " </tr>\n",
304
+ " <tr>\n",
305
+ " <th>3</th>\n",
306
+ " <td>0704.0005</td>\n",
307
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
308
+ " <td>DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC...</td>\n",
309
+ " <td>En este artículo mostramos cómo calcular la ...</td>\n",
310
+ " <td>DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC...</td>\n",
311
+ " </tr>\n",
312
+ " <tr>\n",
313
+ " <th>4</th>\n",
314
+ " <td>0704.0007</td>\n",
315
+ " <td>Polymer Quantum Mechanics and its Continuum Limit</td>\n",
316
+ " <td>La mecánica cuántica de polímeros y su límite ...</td>\n",
317
+ " <td>Una representación cuántica no estándar de l...</td>\n",
318
+ " <td>La mecánica cuántica de polímeros y su límite ...</td>\n",
319
+ " </tr>\n",
320
+ " <tr>\n",
321
+ " <th>...</th>\n",
322
+ " <td>...</td>\n",
323
+ " <td>...</td>\n",
324
+ " <td>...</td>\n",
325
+ " <td>...</td>\n",
326
+ " <td>...</td>\n",
327
+ " </tr>\n",
328
+ " <tr>\n",
329
+ " <th>1749</th>\n",
330
+ " <td>0704.1996</td>\n",
331
+ " <td>A Wave-function for Stringy Universes</td>\n",
332
+ " <td>LPTENS–07/16\\nAbril de 2007\\nUna función de on...</td>\n",
333
+ " <td>Definimos una función de onda para los fondo...</td>\n",
334
+ " <td>Introducción\\nNuestro objetivo en este documen...</td>\n",
335
+ " </tr>\n",
336
+ " <tr>\n",
337
+ " <th>1750</th>\n",
338
+ " <td>0704.1997</td>\n",
339
+ " <td>Query on Negative Temperature, Internal Intera...</td>\n",
340
+ " <td>Microsoft Word - negEntr.doc\\nConsulta sobre t...</td>\n",
341
+ " <td>Después de que la temperatura negativa se vu...</td>\n",
342
+ " <td>Microsoft Word - negEntr.doc\\nConsulta sobre t...</td>\n",
343
+ " </tr>\n",
344
+ " <tr>\n",
345
+ " <th>1751</th>\n",
346
+ " <td>0704.1998</td>\n",
347
+ " <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
348
+ " <td>Ausencia del problema de la quinta fuerza en u...</td>\n",
349
+ " <td>Un modelo de escala invariante que contiene ...</td>\n",
350
+ " <td>Introducción\\n\\tBase de Dos Medidas Teoría de ...</td>\n",
351
+ " </tr>\n",
352
+ " <tr>\n",
353
+ " <th>1752</th>\n",
354
+ " <td>0704.1999</td>\n",
355
+ " <td>Dark matter caustics and the enhancement of se...</td>\n",
356
+ " <td>Proyecto de versión 16 de noviembre de 2018\\nT...</td>\n",
357
+ " <td>Los haloes fríos de materia oscura están pob...</td>\n",
358
+ " <td>Proyecto de versión 16 de noviembre de 2018\\nT...</td>\n",
359
+ " </tr>\n",
360
+ " <tr>\n",
361
+ " <th>1753</th>\n",
362
+ " <td>0704.2000</td>\n",
363
+ " <td>Search for a Higgs boson produced in associati...</td>\n",
364
+ " <td>FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi...</td>\n",
365
+ " <td>Describimos una búsqueda para el modelo está...</td>\n",
366
+ " <td>FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi...</td>\n",
367
+ " </tr>\n",
368
+ " </tbody>\n",
369
+ "</table>\n",
370
+ "<p>1754 rows × 5 columns</p>\n",
371
+ "</div>"
372
+ ],
373
+ "text/plain": [
374
+ " id title \\\n",
375
+ "0 0704.0002 Sparsity-certifying Graph Decompositions \n",
376
+ "1 0704.0003 The evolution of the Earth-Moon system based o... \n",
377
+ "2 0704.0004 A determinant of Stirling cycle numbers counts... \n",
378
+ "3 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
379
+ "4 0704.0007 Polymer Quantum Mechanics and its Continuum Limit \n",
380
+ "... ... ... \n",
381
+ "1749 0704.1996 A Wave-function for Stringy Universes \n",
382
+ "1750 0704.1997 Query on Negative Temperature, Internal Intera... \n",
383
+ "1751 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
384
+ "1752 0704.1999 Dark matter caustics and the enhancement of se... \n",
385
+ "1753 0704.2000 Search for a Higgs boson produced in associati... \n",
386
+ "\n",
387
+ " full_text \\\n",
388
+ "0 Descomposiciones del gráfico de certificación ... \n",
389
+ "1 La evolución del sistema Tierra-Luna basado en... \n",
390
+ "2 Un determinante de los números de ciclo de Sti... \n",
391
+ "3 DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC... \n",
392
+ "4 La mecánica cuántica de polímeros y su límite ... \n",
393
+ "... ... \n",
394
+ "1749 LPTENS–07/16\\nAbril de 2007\\nUna función de on... \n",
395
+ "1750 Microsoft Word - negEntr.doc\\nConsulta sobre t... \n",
396
+ "1751 Ausencia del problema de la quinta fuerza en u... \n",
397
+ "1752 Proyecto de versión 16 de noviembre de 2018\\nT... \n",
398
+ "1753 FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi... \n",
399
+ "\n",
400
+ " abstract \\\n",
401
+ "0 Describimos un nuevo algoritmo, el juego de ... \n",
402
+ "1 La evolución del sistema Tierra-Luna es desc... \n",
403
+ "2 Demostramos que un determinante de los númer... \n",
404
+ "3 En este artículo mostramos cómo calcular la ... \n",
405
+ "4 Una representación cuántica no estándar de l... \n",
406
+ "... ... \n",
407
+ "1749 Definimos una función de onda para los fondo... \n",
408
+ "1750 Después de que la temperatura negativa se vu... \n",
409
+ "1751 Un modelo de escala invariante que contiene ... \n",
410
+ "1752 Los haloes fríos de materia oscura están pob... \n",
411
+ "1753 Describimos una búsqueda para el modelo está... \n",
412
+ "\n",
413
+ " text_no_abstract \n",
414
+ "0 Introducción y preliminares\\nEl foco de este d... \n",
415
+ "1 Introducción \\nLa teoría aceptada popularmente... \n",
416
+ "2 Introducción El propósito principal de este ar... \n",
417
+ "3 DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC... \n",
418
+ "4 La mecánica cuántica de polímeros y su límite ... \n",
419
+ "... ... \n",
420
+ "1749 Introducción\\nNuestro objetivo en este documen... \n",
421
+ "1750 Microsoft Word - negEntr.doc\\nConsulta sobre t... \n",
422
+ "1751 Introducción\\n\\tBase de Dos Medidas Teoría de ... \n",
423
+ "1752 Proyecto de versión 16 de noviembre de 2018\\nT... \n",
424
+ "1753 FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi... \n",
425
+ "\n",
426
+ "[1754 rows x 5 columns]"
427
+ ]
428
+ },
429
+ "execution_count": 77,
430
+ "metadata": {},
431
+ "output_type": "execute_result"
432
+ }
433
+ ],
434
+ "source": [
435
+ "es = df[['id','title','translated','translated_abstract','translated_no_abstract']]\n",
436
+ "es.columns = [\"id\",\"title\", \"full_text\",\"abstract\",\"text_no_abstract\"]\n",
437
+ "es.to_csv(CSV_PATH+'/scientific_paper_es.csv',index=False,encoding='utf-8')\n",
438
+ "es"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "code",
443
+ "execution_count": 79,
444
+ "metadata": {},
445
+ "outputs": [
446
+ {
447
+ "data": {
448
+ "text/html": [
449
+ "<div>\n",
450
+ "<style scoped>\n",
451
+ " .dataframe tbody tr th:only-of-type {\n",
452
+ " vertical-align: middle;\n",
453
+ " }\n",
454
+ "\n",
455
+ " .dataframe tbody tr th {\n",
456
+ " vertical-align: top;\n",
457
+ " }\n",
458
+ "\n",
459
+ " .dataframe thead th {\n",
460
+ " text-align: right;\n",
461
+ " }\n",
462
+ "</style>\n",
463
+ "<table border=\"1\" class=\"dataframe\">\n",
464
+ " <thead>\n",
465
+ " <tr style=\"text-align: right;\">\n",
466
+ " <th></th>\n",
467
+ " <th>id</th>\n",
468
+ " <th>title</th>\n",
469
+ " <th>full_text</th>\n",
470
+ " <th>abstract</th>\n",
471
+ " <th>text_no_abstract</th>\n",
472
+ " </tr>\n",
473
+ " </thead>\n",
474
+ " <tbody>\n",
475
+ " <tr>\n",
476
+ " <th>0</th>\n",
477
+ " <td>0704.0002</td>\n",
478
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
479
+ " <td>Sparsity-certifying Graph Decompositions\\nIlea...</td>\n",
480
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
481
+ " <td>Introduction and preliminaries\\nThe focus of t...</td>\n",
482
+ " </tr>\n",
483
+ " <tr>\n",
484
+ " <th>1</th>\n",
485
+ " <td>0704.0003</td>\n",
486
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
487
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
488
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
489
+ " <td>Introduction \\nThe popularly accepted theory f...</td>\n",
490
+ " </tr>\n",
491
+ " <tr>\n",
492
+ " <th>2</th>\n",
493
+ " <td>0704.0004</td>\n",
494
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
495
+ " <td>A Determinant of Stirling Cycle Numbers Counts...</td>\n",
496
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
497
+ " <td>Introduction The chief purpose of this paper i...</td>\n",
498
+ " </tr>\n",
499
+ " <tr>\n",
500
+ " <th>3</th>\n",
501
+ " <td>0704.0005</td>\n",
502
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
503
+ " <td>FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL...</td>\n",
504
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
505
+ " <td>FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL...</td>\n",
506
+ " </tr>\n",
507
+ " <tr>\n",
508
+ " <th>4</th>\n",
509
+ " <td>0704.0007</td>\n",
510
+ " <td>Polymer Quantum Mechanics and its Continuum Limit</td>\n",
511
+ " <td>Polymer Quantum Mechanics and its Continuum Li...</td>\n",
512
+ " <td>A rather non-standard quantum representation...</td>\n",
513
+ " <td>Polymer Quantum Mechanics and its Continuum Li...</td>\n",
514
+ " </tr>\n",
515
+ " <tr>\n",
516
+ " <th>...</th>\n",
517
+ " <td>...</td>\n",
518
+ " <td>...</td>\n",
519
+ " <td>...</td>\n",
520
+ " <td>...</td>\n",
521
+ " <td>...</td>\n",
522
+ " </tr>\n",
523
+ " <tr>\n",
524
+ " <th>1749</th>\n",
525
+ " <td>0704.1996</td>\n",
526
+ " <td>A Wave-function for Stringy Universes</td>\n",
527
+ " <td>LPTENS–07/16\\nApril 2007\\nA Wave-function for ...</td>\n",
528
+ " <td>We define a wave-function for string theory ...</td>\n",
529
+ " <td>Introduction\\nOur goal in this paper is to emb...</td>\n",
530
+ " </tr>\n",
531
+ " <tr>\n",
532
+ " <th>1750</th>\n",
533
+ " <td>0704.1997</td>\n",
534
+ " <td>Query on Negative Temperature, Internal Intera...</td>\n",
535
+ " <td>Microsoft Word - negEntr.doc\\nQuery on Negativ...</td>\n",
536
+ " <td>After negative temperature is restated, we f...</td>\n",
537
+ " <td>Microsoft Word - negEntr.doc\\nQuery on Negativ...</td>\n",
538
+ " </tr>\n",
539
+ " <tr>\n",
540
+ " <th>1751</th>\n",
541
+ " <td>0704.1998</td>\n",
542
+ " <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
543
+ " <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
544
+ " <td>A scale invariant model containing dilaton $...</td>\n",
545
+ " <td>Introduction\\n\\tBasis of Two Measures Field Th...</td>\n",
546
+ " </tr>\n",
547
+ " <tr>\n",
548
+ " <th>1752</th>\n",
549
+ " <td>0704.1999</td>\n",
550
+ " <td>Dark matter caustics and the enhancement of se...</td>\n",
551
+ " <td>Draft version November 16, 2018\\nPreprint type...</td>\n",
552
+ " <td>Cold dark matter haloes are populated by cau...</td>\n",
553
+ " <td>Draft version November 16, 2018\\nPreprint type...</td>\n",
554
+ " </tr>\n",
555
+ " <tr>\n",
556
+ " <th>1753</th>\n",
557
+ " <td>0704.2000</td>\n",
558
+ " <td>Search for a Higgs boson produced in associati...</td>\n",
559
+ " <td>FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso...</td>\n",
560
+ " <td>We describe a search for the standard model ...</td>\n",
561
+ " <td>FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso...</td>\n",
562
+ " </tr>\n",
563
+ " </tbody>\n",
564
+ "</table>\n",
565
+ "<p>1754 rows × 5 columns</p>\n",
566
+ "</div>"
567
+ ],
568
+ "text/plain": [
569
+ " id title \\\n",
570
+ "0 0704.0002 Sparsity-certifying Graph Decompositions \n",
571
+ "1 0704.0003 The evolution of the Earth-Moon system based o... \n",
572
+ "2 0704.0004 A determinant of Stirling cycle numbers counts... \n",
573
+ "3 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
574
+ "4 0704.0007 Polymer Quantum Mechanics and its Continuum Limit \n",
575
+ "... ... ... \n",
576
+ "1749 0704.1996 A Wave-function for Stringy Universes \n",
577
+ "1750 0704.1997 Query on Negative Temperature, Internal Intera... \n",
578
+ "1751 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
579
+ "1752 0704.1999 Dark matter caustics and the enhancement of se... \n",
580
+ "1753 0704.2000 Search for a Higgs boson produced in associati... \n",
581
+ "\n",
582
+ " full_text \\\n",
583
+ "0 Sparsity-certifying Graph Decompositions\\nIlea... \n",
584
+ "1 The evolution of the Earth-Moon system based o... \n",
585
+ "2 A Determinant of Stirling Cycle Numbers Counts... \n",
586
+ "3 FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL... \n",
587
+ "4 Polymer Quantum Mechanics and its Continuum Li... \n",
588
+ "... ... \n",
589
+ "1749 LPTENS–07/16\\nApril 2007\\nA Wave-function for ... \n",
590
+ "1750 Microsoft Word - negEntr.doc\\nQuery on Negativ... \n",
591
+ "1751 Absence of the Fifth Force Problem in a Model ... \n",
592
+ "1752 Draft version November 16, 2018\\nPreprint type... \n",
593
+ "1753 FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso... \n",
594
+ "\n",
595
+ " abstract \\\n",
596
+ "0 We describe a new algorithm, the $(k,\\ell)$-... \n",
597
+ "1 The evolution of Earth-Moon system is descri... \n",
598
+ "2 We show that a determinant of Stirling cycle... \n",
599
+ "3 In this paper we show how to compute the $\\L... \n",
600
+ "4 A rather non-standard quantum representation... \n",
601
+ "... ... \n",
602
+ "1749 We define a wave-function for string theory ... \n",
603
+ "1750 After negative temperature is restated, we f... \n",
604
+ "1751 A scale invariant model containing dilaton $... \n",
605
+ "1752 Cold dark matter haloes are populated by cau... \n",
606
+ "1753 We describe a search for the standard model ... \n",
607
+ "\n",
608
+ " text_no_abstract \n",
609
+ "0 Introduction and preliminaries\\nThe focus of t... \n",
610
+ "1 Introduction \\nThe popularly accepted theory f... \n",
611
+ "2 Introduction The chief purpose of this paper i... \n",
612
+ "3 FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL... \n",
613
+ "4 Polymer Quantum Mechanics and its Continuum Li... \n",
614
+ "... ... \n",
615
+ "1749 Introduction\\nOur goal in this paper is to emb... \n",
616
+ "1750 Microsoft Word - negEntr.doc\\nQuery on Negativ... \n",
617
+ "1751 Introduction\\n\\tBasis of Two Measures Field Th... \n",
618
+ "1752 Draft version November 16, 2018\\nPreprint type... \n",
619
+ "1753 FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso... \n",
620
+ "\n",
621
+ "[1754 rows x 5 columns]"
622
+ ]
623
+ },
624
+ "execution_count": 79,
625
+ "metadata": {},
626
+ "output_type": "execute_result"
627
+ }
628
+ ],
629
+ "source": [
630
+ "en = df[['id','title','full_text','abstract','text_no_abstract']]\n",
631
+ "en.columns = [\"id\",\"title\", \"full_text\",\"abstract\",\"text_no_abstract\"]\n",
632
+ "en.to_csv(CSV_PATH+'/scientific_paper_en.csv',index=False,encoding='utf-8')\n",
633
+ "en"
634
+ ]
635
+ }
636
+ ],
637
+ "metadata": {
638
+ "interpreter": {
639
+ "hash": "05def4d9d0834781cbeb6b95fd92421f8bd6a45e945308f90d88567f4afc1911"
640
+ },
641
+ "kernelspec": {
642
+ "display_name": "Python 3.8.12 ('tensorflow')",
643
+ "language": "python",
644
+ "name": "python3"
645
+ },
646
+ "language_info": {
647
+ "codemirror_mode": {
648
+ "name": "ipython",
649
+ "version": 3
650
+ },
651
+ "file_extension": ".py",
652
+ "mimetype": "text/x-python",
653
+ "name": "python",
654
+ "nbconvert_exporter": "python",
655
+ "pygments_lexer": "ipython3",
656
+ "version": "3.9.7"
657
+ },
658
+ "orig_nbformat": 4
659
+ },
660
+ "nbformat": 4,
661
+ "nbformat_minor": 2
662
+ }