File size: 3,276 Bytes
a53f410 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: PyPDF2 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (1.26.0)\n",
"Requirement already satisfied: tika in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (1.24)\n",
"Requirement already satisfied: requests in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from tika) (2.27.1)\n",
"Requirement already satisfied: setuptools in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from tika) (58.0.4)\n",
"Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (2.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (2021.10.8)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (3.3)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (1.26.8)\n"
]
}
],
"source": [
"! pip install PyPDF2\n",
"! pip install tika"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from tika import parser\n",
"import codecs\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def obtener_texto(file_path,store_path):\n",
" file_data = parser.from_file(file_path)\n",
" output = file_data['content']\n",
" output = output.strip() \n",
" output= output.split('\\n')\n",
" with codecs.open(store_path+'.txt', 'w','utf-8') as the_file: \n",
" for line in output:\n",
" #print(line)\n",
" if len(line)>4: \n",
" the_file.write(str(line)+'\\n')\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-03-17 17:02:20,018 [MainThread ] [WARNI] Failed to see startup log message; retrying...\n"
]
}
],
"source": [
"PDF_PATH = 'PDF'\n",
"TXT_PATH= 'TXT'\n",
"files = os.listdir(PDF_PATH)\n",
"for file in files:\n",
" obtener_texto(PDF_PATH+'/'+file,TXT_PATH+'/'+file)\n",
" "
]
}
],
"metadata": {
"interpreter": {
"hash": "3f7e9d73c32ad96f75174922c475a50b168aad887cbaa14717912a88f31d3802"
},
"kernelspec": {
"display_name": "Python 3.9.7 ('tf-gpu')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
|