diff --git a/README.md b/README.md index f4eeb3f..07cb79c 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,36 @@ -# Wikipedia Population Table Data Extraction +# World Population Data Extraction -This project demonstrates web scraping using Selenium WebDriver to extract data from an HTML table on a Wikipedia page. Specifically, it retrieves the list of countries and territories by total population, presenting a real-world example of data extraction and automation using Selenium. +This project automates the extraction of country-wise population data from the Worldometers website using Selenium WebDriver. The script captures key statistics, such as population, density, and urban population percentage, and displays them in a tabular format in the console. ## Features -- Automates navigation to a Wikipedia page on country populations. -- Extracts data from an HTML table, including: - - **Location** (Country or territory) - - **Population** - - **Percentage of world population** - - **Date** of population data - - **Source** of the data - - **Notes** -- Processes table rows dynamically to handle updates to the table structure or content. -- Uses JavaScript for smooth scrolling to the target table. +- **Automated Web Navigation**: Accesses the Worldometers Population by Country page programmatically. +- **Dynamic Table Parsing**: Extracts data dynamically from an HTML table using Selenium. +- **Data Columns Extracted**: + - Rank (No) + - Country + - Population + - Yearly Change + - Net Change + - Density + - Land Area + - Migrants + - Fertility Rate + - Median Age + - Urban Population (%) + - World Share +- **Formatted Console Output**: Displays the extracted data in a neatly formatted table in the console. +- **Smooth Scrolling**: Uses JavaScript Executor to scroll through the page for visibility of dynamic content. ## Prerequisites -Ensure you have the following before running the project: -1. **Java Development Kit (JDK)** - Version 8 or above. -2. 
**Google Chrome** - Latest stable version. -3. **ChromeDriver** - Version compatible with your Chrome browser. -4. **Selenium WebDriver** - Included in the project dependencies. +Ensure the following are set up on your system: +1. **Java Development Kit (JDK)** - Version 8 or above. +2. **Selenium WebDriver** - Include the required Selenium libraries in your project. +3. **Google Chrome** - Latest stable version. +4. **ChromeDriver** - Ensure the ChromeDriver version matches your browser version. +5. **Integrated Development Environment (IDE)** - Any IDE like IntelliJ IDEA or Eclipse for running Java programs. ## Technologies Used -- **Java** - The programming language for the project. -- **Selenium WebDriver** - For web element interaction and automation. -- **Google Chrome & ChromeDriver** - For browser-based automation. -- **JavaScript Executor** - For advanced browser interactions like scrolling. +- **Java** - The programming language used for the project. +- **Selenium WebDriver** - For web automation and data extraction. +- **Google Chrome & ChromeDriver** - To simulate browser-based interaction. +- **JavaScript Executor** - For smooth scrolling through the web page. 
diff --git a/WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class b/WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class deleted file mode 100644 index 1811f0f..0000000 Binary files a/WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class and /dev/null differ diff --git a/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java b/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java deleted file mode 100644 index dbb0edc..0000000 --- a/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java +++ /dev/null @@ -1,74 +0,0 @@ -package data_extraction; - -import java.time.Duration; -import java.util.List; -import org.openqa.selenium.By; -import org.openqa.selenium.JavascriptExecutor; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.WebElement; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.support.ui.ExpectedConditions; -import org.openqa.selenium.support.ui.WebDriverWait; - -public class WebTableDataExtraction { - - public static void main(String[] args) { - WebDriver driver = new ChromeDriver(); - driver.manage().window().maximize(); - - try { - // Print the message before extraction - System.out.println("List of countries and territories by total population"); - - // Open the Wikipedia page - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); - driver.get("https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"); - waitForTheUser(); - - JavascriptExecutor scrollDownOne = (JavascriptExecutor) driver; - scrollDownOne.executeScript("window.scrollBy(0,1200)"); - waitForTheUser(); - - // Locate the table by XPath - WebElement table = wait.until(ExpectedConditions.elementToBeClickable(By.xpath("//table[@class='wikitable sortable sticky-header sort-under mw-datatable col2left col6left jquery-tablesorter']"))); - waitForTheUser(); - - // Get all rows from the table (skip the first row as 
it's the header) - List rows = table.findElements(By.tagName("tr")); - - // Loop through each row - for (int i = 1; i < rows.size(); i++) { // Start from 1 to skip header - WebElement row = rows.get(i); - - // Get all columns (td) in the current row - List cols = row.findElements(By.tagName("td")); - - // Extract the Location, Population, % of world, Date, Source and Notes - if (cols.size() > 1) { - String location = cols.get(0).getText().trim(); // Location (1st column) - String population = cols.get(1).getText().trim(); // Population (2nd column) - String perc_world = cols.get(2).getText().trim(); // % of world (3rd column) - String date = cols.get(3).getText().trim(); // Date (4th column) - String source = cols.get(4).getText().trim(); // Source (5th column) - String notes = cols.get(5).getText().trim(); // Notes (6th column) - - // Print the extracted data - System.out.println("Location: " + location + " | Population: " + population + " | % of world: " + perc_world + " | Date: " + date + " | Source: " + source + " | Notes: " + notes); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - // Close the browser after scraping - driver.quit(); - } - } - - public static void waitForTheUser() { - try { - Thread.sleep(2000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } -} diff --git a/WikiPopulationScraper/.classpath b/Worldometer_Project/.classpath similarity index 100% rename from WikiPopulationScraper/.classpath rename to Worldometer_Project/.classpath diff --git a/WikiPopulationScraper/.project b/Worldometer_Project/.project similarity index 90% rename from WikiPopulationScraper/.project rename to Worldometer_Project/.project index 68201ba..b1285db 100644 --- a/WikiPopulationScraper/.project +++ b/Worldometer_Project/.project @@ -1,6 +1,6 @@ - WikiPopulationScraper + Worldometer_Project diff --git a/WikiPopulationScraper/.settings/org.eclipse.core.resources.prefs 
package table_extraction;

import java.time.Duration;
import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

/**
 * Opens the Worldometers "population by country" page in Chrome, reads the
 * population table row by row and prints it to the console in fixed-width
 * columns.
 */
public class Population_Extraction {

    /** Page that hosts the population table. */
    private static final String PAGE_URL =
            "https://www.worldometers.info/world-population/population-by-country/";

    /** Locator for the table; it matches the full {@code class} attribute value exactly. */
    private static final String TABLE_XPATH =
            "//table[@class='table table-striped table-bordered dataTable no-footer']";

    /** Shared by the header line and every data line so the columns stay aligned. */
    private static final String ROW_FORMAT =
            "%-5s %-30s %-15s %-15s %-15s %-10s %-15s %-10s %-10s %-10s %-15s %-10s%n";

    /** Number of cells read from each data row (No ... World Share). */
    private static final int COLUMN_COUNT = 12;

    /** Length of the fixed pause inserted between browser actions, in milliseconds. */
    private static final long PAUSE_MILLIS = 2000;

    /**
     * Runs the extraction: loads the page, waits for the table, prints the
     * header and every complete data row, then scrolls down the page three
     * times before closing the browser.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        WebDriver driver = new ChromeDriver();

        try {
            // Inside the try so that a failure here still reaches driver.quit().
            driver.manage().window().maximize();

            // Print the message before extraction
            System.out.println("World Population");

            WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));
            driver.get(PAGE_URL);
            waitForTheUser();

            // One executor handle is enough for every scroll in this run.
            JavascriptExecutor js = (JavascriptExecutor) driver;
            scrollDown(js, 200);

            // The table only has to be rendered, not clickable, before it is read.
            WebElement table = wait.until(
                    ExpectedConditions.visibilityOfElementLocated(By.xpath(TABLE_XPATH)));
            waitForTheUser();

            List<WebElement> rows = table.findElements(By.tagName("tr"));

            // Print headers
            System.out.printf(ROW_FORMAT,
                    "No", "Country", "Population", "Yearly Change", "Net Change", "Density", "Land Area",
                    "Migrants", "Fert. Rate", "Med. Age", "Urban Pop%", "World Share");
            System.out.println("-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------");

            // Index 0 is treated as the header row and skipped.
            for (int i = 1; i < rows.size(); i++) {
                printRow(rows.get(i));
            }

            for (int pass = 0; pass < 3; pass++) {
                scrollDown(js, 4000);
            }

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            driver.quit();
        }
    }

    /**
     * Prints one table row using {@link #ROW_FORMAT}.
     *
     * <p>Rows with fewer than {@link #COLUMN_COUNT} {@code td} cells are
     * skipped. Reading a fixed set of indexes from a shorter row would throw
     * {@link IndexOutOfBoundsException} and end the whole extraction.
     *
     * @param row a {@code tr} element of the population table
     */
    private static void printRow(WebElement row) {
        List<WebElement> cells = row.findElements(By.tagName("td"));
        if (cells.size() < COLUMN_COUNT) {
            return;
        }

        Object[] values = new Object[COLUMN_COUNT];
        for (int c = 0; c < COLUMN_COUNT; c++) {
            values[c] = cells.get(c).getText().trim();
        }
        System.out.printf(ROW_FORMAT, values);
    }

    /**
     * Scrolls the window down by the given number of pixels, then pauses.
     *
     * @param js     executor bound to the current browser session
     * @param pixels vertical distance passed to {@code window.scrollBy}
     */
    private static void scrollDown(JavascriptExecutor js, int pixels) {
        js.executeScript("window.scrollBy(0," + pixels + ")");
        waitForTheUser();
    }

    /**
     * Blocks the calling thread for {@link #PAUSE_MILLIS} milliseconds.
     *
     * <p>If the thread is interrupted while sleeping, the interrupt flag is
     * restored so callers further up the stack can still observe it.
     */
    public static void waitForTheUser() {
        try {
            Thread.sleep(PAUSE_MILLIS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}