diff --git a/WikiPopulationScraper/.classpath b/HTML_Tables/.classpath similarity index 100% rename from WikiPopulationScraper/.classpath rename to HTML_Tables/.classpath diff --git a/WikiPopulationScraper/.project b/HTML_Tables/.project similarity index 90% rename from WikiPopulationScraper/.project rename to HTML_Tables/.project index 68201ba..105226c 100644 --- a/WikiPopulationScraper/.project +++ b/HTML_Tables/.project @@ -1,6 +1,6 @@ - WikiPopulationScraper + HTML_Tables diff --git a/WikiPopulationScraper/.settings/org.eclipse.core.resources.prefs b/HTML_Tables/.settings/org.eclipse.core.resources.prefs similarity index 100% rename from WikiPopulationScraper/.settings/org.eclipse.core.resources.prefs rename to HTML_Tables/.settings/org.eclipse.core.resources.prefs diff --git a/WikiPopulationScraper/.settings/org.eclipse.jdt.core.prefs b/HTML_Tables/.settings/org.eclipse.jdt.core.prefs similarity index 100% rename from WikiPopulationScraper/.settings/org.eclipse.jdt.core.prefs rename to HTML_Tables/.settings/org.eclipse.jdt.core.prefs diff --git a/HTML_Tables/bin/data_extraction/HTMLDataExtraction.class b/HTML_Tables/bin/data_extraction/HTMLDataExtraction.class new file mode 100644 index 0000000..fc76dd7 Binary files /dev/null and b/HTML_Tables/bin/data_extraction/HTMLDataExtraction.class differ diff --git a/HTML_Tables/src/data_extraction/HTMLDataExtraction.java b/HTML_Tables/src/data_extraction/HTMLDataExtraction.java new file mode 100644 index 0000000..ded9878 --- /dev/null +++ b/HTML_Tables/src/data_extraction/HTMLDataExtraction.java @@ -0,0 +1,66 @@ +package data_extraction; + +import java.time.Duration; +import java.util.List; +import org.openqa.selenium.By; +import org.openqa.selenium.JavascriptExecutor; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.support.ui.ExpectedConditions; +import org.openqa.selenium.support.ui.WebDriverWait; + +public class HTMLDataExtraction { + + public static void main(String[] args) { + WebDriver driver = new ChromeDriver(); + driver.manage().window().maximize(); + + try { + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); + driver.get("https://www.w3schools.com/html/html_tables.asp"); + waitForTheUser(); + + JavascriptExecutor scrollDownOne = (JavascriptExecutor) driver; + scrollDownOne.executeScript("window.scrollBy(0,200)"); + waitForTheUser(); + + // Locate the table by XPath + WebElement table = wait.until(ExpectedConditions.elementToBeClickable(By.xpath("//table[@class='ws-table-all']"))); + waitForTheUser(); + + // Get all rows from the table (skip the first row as it's the header) + List rows = table.findElements(By.tagName("tr")); + + // Loop through each row + for (int i = 1; i < rows.size(); i++) { + WebElement row = rows.get(i); + + // Get all columns (td) in the current row + List cols = row.findElements(By.tagName("td")); + + // Extract the Company, Contact and Country + if (cols.size() > 1) { + String company = cols.get(0).getText().trim(); // Company (1st column) + String contact = cols.get(1).getText().trim(); // Contact (2nd column) + String country = cols.get(2).getText().trim(); // Country (3rd column) + + // Print the extracted data + System.out.println("Company: " + company + " | Contact: " + contact + " | Country: " + country); + } + } + } catch (Exception e) { + e.printStackTrace(); + } finally { + driver.quit(); + } + } + + public static void waitForTheUser() { + try { + Thread.sleep(2000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } +} diff --git a/README.md b/README.md index f4eeb3f..88fff74 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,26 @@ -# Wikipedia Population Table Data Extraction +# HTML Data Extraction -This project demonstrates web scraping using Selenium WebDriver to extract data from an HTML table on a Wikipedia page. Specifically, it retrieves the list of countries and territories by total population, presenting a real-world example of data extraction and automation using Selenium. +This project demonstrates data extraction from an HTML table on the W3Schools website using Selenium WebDriver. The script automates the process of retrieving company, contact, and country information from the given table, showcasing how to interact with and parse web elements programmatically. ## Features -- Automates navigation to a Wikipedia page on country populations. -- Extracts data from an HTML table, including: - - **Location** (Country or territory) - - **Population** - - **Percentage of world population** - - **Date** of population data - - **Source** of the data - - **Notes** -- Processes table rows dynamically to handle updates to the table structure or content. -- Uses JavaScript for smooth scrolling to the target table. +- Automates navigation to the W3Schools HTML table page. +- Scrolls to the target table for visibility. +- Extracts the following columns from the HTML table: + - Company + - Contact + - Country +- Displays the extracted data in the console in a structured format. +- Handles table rows dynamically, skipping headers to focus on content rows. ## Prerequisites Ensure you have the following before running the project: 1. **Java Development Kit (JDK)** - Version 8 or above. 2. **Google Chrome** - Latest stable version. 3. **ChromeDriver** - Version compatible with your Chrome browser. -4. **Selenium WebDriver** - Included in the project dependencies. +4. **Selenium WebDriver** - Included in the project dependencies. ## Technologies Used - **Java** - The programming language for the project. - **Selenium WebDriver** - For web element interaction and automation. - **Google Chrome & ChromeDriver** - For browser-based automation. -- **JavaScript Executor** - For advanced browser interactions like scrolling. +- **JavaScript Executor** - For smooth scrolling to the table. diff --git a/WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class b/WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class deleted file mode 100644 index 1811f0f..0000000 Binary files a/WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class and /dev/null differ diff --git a/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java b/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java deleted file mode 100644 index dbb0edc..0000000 --- a/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java +++ /dev/null @@ -1,74 +0,0 @@ -package data_extraction; - -import java.time.Duration; -import java.util.List; -import org.openqa.selenium.By; -import org.openqa.selenium.JavascriptExecutor; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.WebElement; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.support.ui.ExpectedConditions; -import org.openqa.selenium.support.ui.WebDriverWait; - -public class WebTableDataExtraction { - - public static void main(String[] args) { - WebDriver driver = new ChromeDriver(); - driver.manage().window().maximize(); - - try { - // Print the message before extraction - System.out.println("List of countries and territories by total population"); - - // Open the Wikipedia page - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); - driver.get("https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"); - waitForTheUser(); - - JavascriptExecutor scrollDownOne = (JavascriptExecutor) driver; - scrollDownOne.executeScript("window.scrollBy(0,1200)"); - waitForTheUser(); - - // Locate the table by XPath - WebElement table = wait.until(ExpectedConditions.elementToBeClickable(By.xpath("//table[@class='wikitable sortable sticky-header sort-under mw-datatable col2left col6left jquery-tablesorter']"))); - waitForTheUser(); - - // Get all rows from the table (skip the first row as it's the header) - List rows = table.findElements(By.tagName("tr")); - - // Loop through each row - for (int i = 1; i < rows.size(); i++) { // Start from 1 to skip header - WebElement row = rows.get(i); - - // Get all columns (td) in the current row - List cols = row.findElements(By.tagName("td")); - - // Extract the Location, Population, % of world, Date, Source and Notes - if (cols.size() > 1) { - String location = cols.get(0).getText().trim(); // Location (1st column) - String population = cols.get(1).getText().trim(); // Population (2nd column) - String perc_world = cols.get(2).getText().trim(); // % of world (3rd column) - String date = cols.get(3).getText().trim(); // Date (4th column) - String source = cols.get(4).getText().trim(); // Source (5th column) - String notes = cols.get(5).getText().trim(); // Notes (6th column) - - // Print the extracted data - System.out.println("Location: " + location + " | Population: " + population + " | % of world: " + perc_world + " | Date: " + date + " | Source: " + source + " | Notes: " + notes); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - // Close the browser after scraping - driver.quit(); - } - } - - public static void waitForTheUser() { - try { - Thread.sleep(2000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } -}