# Notebook: batch-convert the documents in PDF/ to plain-text files in TXT/.
#
# Dependencies (install once, in the kernel's environment, ideally pinned):
#     %pip install tika
# The original notebook also installed PyPDF2, but nothing here imports it.
#
# This code relies on the notebook's import cell:
#     from tika import parser
#     import codecs
#     import os


def obtener_texto(file_path, store_path, min_length=4):
    """Extract the text of one document with Tika and save it as UTF-8 text.

    The document is parsed with ``parser.from_file``. The extracted text is
    stripped, split on ``'\\n'`` and every line longer than ``min_length``
    characters is written, followed by ``'\\n'``, to ``store_path + '.txt'``.

    Args:
        file_path: Path of the document to parse (e.g. a PDF).
        store_path: Output path without the final extension; ``'.txt'`` is
            appended, so ``'TXT/a.pdf'`` produces ``'TXT/a.pdf.txt'``.
        min_length: Lines with this many characters or fewer are dropped.
            Defaults to 4, the threshold the notebook originally hard-coded.

    Returns:
        None. The result is the file written to disk. When no text could be
        extracted, an empty ``.txt`` file is written instead of raising.
    """
    file_data = parser.from_file(file_path)

    # Tika yields no usable 'content' (None, or the key is absent) when it
    # cannot extract text, e.g. from an image-only PDF. Treat that as empty
    # text so one bad document does not abort a whole batch.
    content = file_data.get('content') or ''
    lines = content.strip().split('\n')

    target = store_path + '.txt'
    target_dir = os.path.dirname(target)
    if target_dir:
        # Create the output folder on first use instead of failing on open().
        os.makedirs(target_dir, exist_ok=True)

    # codecs.open is kept on purpose: it performs no newline translation, so
    # the output bytes are the same on every platform.
    with codecs.open(target, 'w', 'utf-8') as the_file:
        for line in lines:
            if len(line) > min_length:
                the_file.write(line + '\n')


def convertir_directorio(pdf_dir, txt_dir):
    """Run ``obtener_texto`` on every regular file directly inside ``pdf_dir``.

    Each input ``<pdf_dir>/<name>`` is written to ``<txt_dir>/<name>.txt``.
    Sub-directories are skipped because they cannot be parsed as documents.

    Args:
        pdf_dir: Folder containing the source documents.
        txt_dir: Folder that receives the text files; created if missing.
    """
    os.makedirs(txt_dir, exist_ok=True)
    for name in os.listdir(pdf_dir):
        source = os.path.join(pdf_dir, name)
        if not os.path.isfile(source):
            continue
        obtener_texto(source, os.path.join(txt_dir, name))


PDF_PATH = 'PDF'
TXT_PATH = 'TXT'

# Inside a notebook kernel __name__ is "__main__", so this runs exactly like
# the original cell; the guard only prevents the batch from starting when the
# code is imported from another module.
if __name__ == "__main__":
    convertir_directorio(PDF_PATH, TXT_PATH)