{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "from pandas import json_normalize\n", "import requests\n", "from pathlib import Path\n", "from multiprocessing.pool import ThreadPool as Pool\n", "import codecs\n", "import random\n", "import re" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "# Prefix that an arXiv id is appended to when building a PDF download URL.\n",
 "URL_BASE = \"https://arxiv.org/pdf/\"\n",
 "# Folder the downloaded PDF files are written to.\n",
 "PDF_PATH = \"PDF\"\n",
 "# NOTE(review): not referenced in the visible cells; presumably the folder for extracted text - confirm.\n",
 "TXT_PATH = \"TXT\""
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Arxiv\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleabstractText
00704.0001Calculation of prompt diphoton production cros...A fully differential calculation in perturba...
10704.0002Sparsity-certifying Graph DecompositionsWe describe a new algorithm, the $(k,\\ell)$-...
20704.0003The evolution of the Earth-Moon system based o...The evolution of Earth-Moon system is descri...
30704.0004A determinant of Stirling cycle numbers counts...We show that a determinant of Stirling cycle...
40704.0005From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...In this paper we show how to compute the $\\L...
...............
19960704.1997Query on Negative Temperature, Internal Intera...After negative temperature is restated, we f...
19970704.1998Absence of the Fifth Force Problem in a Model ...A scale invariant model containing dilaton $...
19980704.1999Dark matter caustics and the enhancement of se...Cold dark matter haloes are populated by cau...
19990704.2000Search for a Higgs boson produced in associati...We describe a search for the standard model ...
20000704.2001Geometry of Parallelizable Manifolds in the Co...In this paper, we deal with a generalization...
\n", "

2001 rows × 4 columns

\n", "
" ], "text/plain": [ " id title \\\n", "0 0704.0001 Calculation of prompt diphoton production cros... \n", "1 0704.0002 Sparsity-certifying Graph Decompositions \n", "2 0704.0003 The evolution of the Earth-Moon system based o... \n", "3 0704.0004 A determinant of Stirling cycle numbers counts... \n", "4 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n", "... ... ... \n", "1996 0704.1997 Query on Negative Temperature, Internal Intera... \n", "1997 0704.1998 Absence of the Fifth Force Problem in a Model ... \n", "1998 0704.1999 Dark matter caustics and the enhancement of se... \n", "1999 0704.2000 Search for a Higgs boson produced in associati... \n", "2000 0704.2001 Geometry of Parallelizable Manifolds in the Co... \n", "\n", " abstract Text \n", "0 A fully differential calculation in perturba... \n", "1 We describe a new algorithm, the $(k,\\ell)$-... \n", "2 The evolution of Earth-Moon system is descri... \n", "3 We show that a determinant of Stirling cycle... \n", "4 In this paper we show how to compute the $\\L... \n", "... ... ... \n", "1996 After negative temperature is restated, we f... \n", "1997 A scale invariant model containing dilaton $... \n", "1998 Cold dark matter haloes are populated by cau... \n", "1999 We describe a search for the standard model ... \n", "2000 In this paper, we deal with a generalization... 
\n", "\n", "[2001 rows x 4 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Read ids as strings so values such as 0704.0001 keep their leading zero.\n",
 "data = pd.read_json('ARxiv/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=2001, dtype={'id': 'str'})\n",
 "try:\n",
 "    # Only the first chunk (2001 records) is used in this notebook.\n",
 "    first_chunk = next(iter(data))\n",
 "finally:\n",
 "    # Release the file handle held by the chunked reader.\n",
 "    data.close()\n",
 "# Keep the three metadata columns and add an empty 'Text' column in fourth position.\n",
 "df = first_chunk[['id', 'title', 'abstract']].assign(Text='')\n",
 "df"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
 "def GetFileURL(file_id, timeout=60):\n",
 "    \"\"\"Download the PDF for one arXiv id and store it under PDF_PATH.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    file_id : str\n",
 "        Identifier appended to URL_BASE to build the download URL; it is also\n",
 "        used as the file name (with a '.pdf' suffix).\n",
 "    timeout : float, optional\n",
 "        Seconds to wait for the server before the request is abandoned.\n",
 "\n",
 "    Raises\n",
 "    ------\n",
 "    requests.RequestException\n",
 "        If the request fails, times out, or returns an HTTP error status.\n",
 "    \"\"\"\n",
 "    url = URL_BASE + file_id\n",
 "    filename = Path(PDF_PATH + '/' + file_id + '.pdf')\n",
 "    # Make sure the target folder exists (this also covers ids that contain '/').\n",
 "    filename.parent.mkdir(parents=True, exist_ok=True)\n",
 "    # One request is enough; a timeout keeps a stalled download from blocking a pool thread forever.\n",
 "    response = requests.get(url, timeout=timeout)\n",
 "    # Fail loudly instead of saving an HTTP error page under a .pdf name.\n",
 "    response.raise_for_status()\n",
 "    filename.write_bytes(response.content)"
 ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [
 "pool_size = 16\n",
 "\n",
 "\n",
 "def worker(file):\n",
 "    \"\"\"Download one paper; on failure print the reason and append the id to PDF_PATH/log.txt.\"\"\"\n",
 "    try:\n",
 "        GetFileURL(file)\n",
 "    except Exception as exc:\n",
 "        print('error with item ' + str(file) + ': ' + repr(exc))\n",
 "        # NOTE(review): indentation was lost in the saved notebook; this assumes\n",
 "        # log.txt is meant to list only the ids whose download failed - confirm.\n",
 "        try:\n",
 "            Path(PDF_PATH).mkdir(parents=True, exist_ok=True)\n",
 "            with open(PDF_PATH + '/log.txt', 'a', encoding='utf-8') as the_file:\n",
 "                the_file.write(str(file) + '\\n')\n",
 "        except OSError as log_exc:\n",
 "            print('error en log ' + str(file) + ': ' + repr(log_exc))\n",
 "\n",
 "\n",
 "def get_ids(iteracion, batch=100, frame=None):\n",
 "    \"\"\"Return the 'id' values for one batch of rows.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    iteracion : int\n",
 "        Zero-based batch number; the batch starts at row iteracion * batch.\n",
 "    batch : int, optional\n",
 "        Number of rows per batch.\n",
 "    frame : pandas.DataFrame, optional\n",
 "        Table to slice; defaults to the notebook-level df.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    pandas.Series\n",
 "        The 'id' column of the selected rows (empty when the batch lies past the last row).\n",
 "    \"\"\"\n",
 "    if frame is None:\n",
 "        # The chunked reader stored in data cannot be sliced, so use the loaded DataFrame.\n",
 "        frame = df\n",
 "    inicio = int(iteracion * batch)\n",
 "    return frame.iloc[inicio:inicio + batch]['id']\n",
 "\n",
 "\n",
 "filesId = get_ids(19)\n",
 "# map() blocks until every download has been attempted, so leaving the\n",
 "# with-block (which terminates the pool) cannot cut a download short.\n",
 "with Pool(pool_size) as pool:\n",
 "    pool.map(worker, filesId)"
 ] } ], "metadata": { "interpreter": { "hash": "3f7e9d73c32ad96f75174922c475a50b168aad887cbaa14717912a88f31d3802" }, "kernelspec": { "display_name": "Python 3.9.7 ('tf-gpu')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }