From 08fe57bf453f187eca60b0dd8136c455d5fe9979 Mon Sep 17 00:00:00 2001 From: "kouek.jb" Date: Wed, 17 Mar 2021 13:03:58 +0200 Subject: [PATCH 1/2] Guided Scraping --- 1Introduction/Part3_Simple_Web_Scraping.ipynb | 176 +++++++++++++++++- .../Part4_Web_Scraping_WebDriver.ipynb | 23 ++- 2 files changed, 194 insertions(+), 5 deletions(-) diff --git a/1Introduction/Part3_Simple_Web_Scraping.ipynb b/1Introduction/Part3_Simple_Web_Scraping.ipynb index a2e317fd..81c0570a 100644 --- a/1Introduction/Part3_Simple_Web_Scraping.ipynb +++ b/1Introduction/Part3_Simple_Web_Scraping.ipynb @@ -232,6 +232,61 @@ "- Récupérer le domaine en fonction d'un url" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import random\n", + "class Scrapper:\n", + " def __init__(self, url):\n", + " # Définition du User Agent\n", + " self.headers = \n", + " self.url = url\n", + " \n", + " def get_response(self, timeout=, max_retry=3):\n", + " \"\"\"\n", + " 1. Spécifier une valeur de timeout dans les paramètres de la fonction\n", + " 2. Utiliser requests afin de procéder à la requête en utilisant l'url du site web en question,\n", + " le User Agent, et le timeout\n", + " \"\"\"\n", + " lastException = None \n", + " for _ in range(max_retry):\n", + " try:\n", + " response = \n", + " return response.text\n", + " except Exception as e:\n", + " lastException = e\n", + " raise lastException\n", + " \n", + " \n", + " def remove_white_spaces(self, x):\n", + " \"\"\" Fonction qui enlève les espaces d'une string\n", + " Exemple ' hello ' > 'hello'\n", + " \"\"\"\n", + " return \n", + " \n", + " def clean_html_string(self, raw_html):\n", + " \"\"\"Fonction qui enlève les caractères spéciaux d'une string\n", + " Exemple : '<> Ca va ?!' 
> 'Ca va' \n", + " \"\"\"\n", + " return \n", + " \n", + " def extract_domain_name(self):\n", + " \"\"\" Fonction qui extrait le nom de domaine\n", + " Exemple 'http://www.lemonde.fr/accueil' > 'www.lemonde.fr'\n", + " \"\"\"\n", + " return \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " " + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -498,8 +553,95 @@ "- Ajouter une méthode pour récupérer l'objet soup d'un url\n", "- Récupérer une liste de User Agent et effectuer une rotation aléatoire sur celui à utiliser\n", "- Utiliser cette classe pour parser une page HTML et récupérer : le titre, tous les H1 (si ils existent), les liens vers les images, les liens sortants vers d'autres sites, et le texte principal.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import random\n", + "class Scrapper:\n", + " def __init__(self, url):\n", + " # \n", + "\n", + " def __init__(self, url):\n", + " \"\"\" 1. Créer une liste de plusieurs User Agents (Un max de 5 est suffisant pour l'exemple)\n", + " 2. 
effectuer la rotation pour en choisir un aléatoirement à chaque requête\n", + " \"\"\"\n", + " self.user_agent_list = []\n", + " self.headers = \n", + " self.url = url\n", + " \n", + " \n", + " ########\n", + " # RAJOUTER les fonctions définies dans les Exercices 1 et 2 ci-dessous\n", + " ###### \n", + "\n", + " \n", + " def get_soup(self, text_response):\n", + " \"\"\"Parser la page html en utilisant BeautifulSoup pour accéder facilement aux balises et leur contenu\"\"\"\n", + " return \n", + " \n", + " \n", + " def get_title(self, soup):\n", + " \"\"\"\n", + " Extraire le titre de la page\"\"\"\n", + " return \n", + " \n", + " def get_all_h1(self, soup):\n", + " \"\"\"\n", + " Extraire toutes les balises h1 de la page\"\"\"\n", + " return \n", + " \n", + " def get_imagelinks(self, soup):\n", + " \"\"\" Extraire les images visibles dans l'url\"\"\"\n", + " return \n", + " \n", + " def get_main_text(self, soup):\n", + " \"\"\"\n", + " Extraire le texte principal sur la page\"\"\"\n", + " return\n", + " \n", + " def get_out_links(self, soup):\n", + " \"\"\" Extraire tous les liens sortants\"\"\"\n", + " return \n", + " \n", "\n", - "Parsing d'un sitemaps pour récupérer une listes de liens avec les informations disponibles. -> Stocker dans un dictionnaire et dans un fichier JSON local." + " \n", + " def main(self):\n", + " \"\"\"\n", + " 1. Appeler la méthode qui va requêter le site en question avec les bons paramètres\n", + " 2. Appeler la méthode pour la création de l'objet soup avec les bons paramètres\n", + " 3. 
Retourner l'objet résultat qui contient les différents éléments de la page\n", + " ( en utilisant toutes les fonctions développées dans la classe ci-dessus)\n", + " \"\"\"\n", + " response = \n", + " soup = \n", + " resultat = {\n", + " 'domain_name': ,\n", + " 'title':, \n", + " 'url': , \n", + " 'images': , \n", + " 'h2': \n", + " \n", + " }\n", + " return resultat\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. Instancier la classe scrapper avec l'url à scrapper\n", + "s = \n", + "# 2. Executer la méthode principale de la classe afin d'obtenir les résultats voulus\n" ] }, { @@ -538,6 +680,36 @@ "- Fouiller dans les requêtes" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# 1. Définir un User Agent\n", + "headers = \n", + "\n", + "def get_qwant_results(query):\n", + " # Trouver la bonne url de l'API\n", + " url = \n", + " # Requeter cette URL\n", + " response = \n", + " # Retourner un objet JSON et accèdant à la partie qui nous intéresse dans le 'result'\n", + " results = response.json()\n", + " return " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Appliquer la méthode ci dessus sur le sujet de votre choix" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -592,7 +764,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/1Introduction/Part4_Web_Scraping_WebDriver.ipynb b/1Introduction/Part4_Web_Scraping_WebDriver.ipynb index 541c903e..d1105af0 100644 --- a/1Introduction/Part4_Web_Scraping_WebDriver.ipynb +++ b/1Introduction/Part4_Web_Scraping_WebDriver.ipynb @@ -380,11 +380,22 @@ } ], "source": [ - "def extract_price(offer_web_element):\n", + "def extract_price(offer_web_element):\"\"\"\n", + " Fonction qui 
retourne le prix d'un article\n", + " Exemple : Dans l'offre ci-dessous : Retourner 519\n", + " \"\"\"\n", " raise NotImplementedError('Need to be implemented')\n", " pass\n", "\n", - "extract_price(all_first_line_elements[0])" + "# extract_price(all_first_line_elements[0])\n", + "extract_price(\"\"\"\n", + "OFFRE DU JOUR\n", + "219,90 €\n", + "Prix conseillé : 519,00 € (-58%)\n", + "Braun: Épilateur Lumière Pulsée 5009\n", + "Vendu et expédié par Amazon.fr\n", + "239\n", + "Offre terminée\"\"\")" ] }, { @@ -414,6 +425,9 @@ ], "source": [ "def extract_image(offer_web_element):\n", + " \"\"\"\n", + " Fonction qui extrait l'image à partir d'une offre\n", + " \"\"\"\n", " raise NotImplementedError('Need to be implemented')\n", " pass\n", "\n", @@ -447,6 +461,9 @@ ], "source": [ "def extract_title(offer_web_element):\n", + " \"\"\"\n", + " Fonction qui extrait le titre de l'offre\n", + " \"\"\"\n", " raise NotImplementedError('Need to be implemented')\n", " pass\n", "\n", @@ -479,7 +496,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.8.5" } }, "nbformat": 4, From 7ebc0117922533c54cc693bfc1a8027b9fb86604 Mon Sep 17 00:00:00 2001 From: "kouek.jb" Date: Wed, 17 Mar 2021 13:12:35 +0200 Subject: [PATCH 2/2] Cleaning Exos Mongo --- 3Mongo/ExerciceKickStarter.ipynb | 15 ++++++++++++++- 3Mongo/ExerciceYoutube.ipynb | 28 +++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/3Mongo/ExerciceKickStarter.ipynb b/3Mongo/ExerciceKickStarter.ipynb index 0f9ab15f..07448acb 100644 --- a/3Mongo/ExerciceKickStarter.ipynb +++ b/3Mongo/ExerciceKickStarter.ipynb @@ -264,6 +264,19 @@ "### Netoyer les données" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_ks['launched'] = pd.to_datetime(df_ks['launched'], errors='coerce')\n", + "df_ks = df_ks.dropna(subset=['launched'])\n", + "df_ks.rename(columns={'ID':'_id'}, 
inplace=True)\n", + "\n", + "l_d =df_ks.to_dict(orient='records')" + ] + }, { "cell_type": "markdown", "metadata": { @@ -341,7 +354,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/3Mongo/ExerciceYoutube.ipynb b/3Mongo/ExerciceYoutube.ipynb index 15ed99b8..e67b857e 100644 --- a/3Mongo/ExerciceYoutube.ipynb +++ b/3Mongo/ExerciceYoutube.ipynb @@ -159,6 +159,32 @@ "### Netoyer les données" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_youtube['date'] = pd.to_datetime(df_youtube['date'], errors='coerce')\n", + "df_youtube = df_youtube.dropna(subset=['date'])\n", + "\n", + "\n", + "df_youtube.rename(columns={'video_id':'_id'}, inplace=True)\n", + "\n", + "df_youtube = df_youtube.drop_duplicates(subset='_id', keep='last')\n", + "\n", + "l_d =df_youtube.to_dict(orient='records')\n", + "\n", + "import json\n", + "with open('./data/US_category_id.json', 'r') as f:\n", + " categ = json.load(f)\n", + "\n", + "for video in l_d:\n", + " for cat in categ['items']:\n", + " if video['category_id'] == int(cat['id']):\n", + " video['category_name'] = cat['snippet']['title']" + ] + }, { "cell_type": "markdown", "metadata": { @@ -243,7 +269,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.5" } }, "nbformat": 4,