From ee4a04a0813bb21a0022b70f50056bde1b2f1b7f Mon Sep 17 00:00:00 2001 From: Nithu Lakshmi Date: Sun, 15 Dec 2024 11:22:42 +0530 Subject: [PATCH 1/2] First Commit --- .../WebTableDataExtraction.class | Bin 4346 -> 0 bytes .../WebTableDataExtraction.java | 74 ------------- .../.classpath | 0 .../.project | 2 +- .../org.eclipse.core.resources.prefs | 0 .../.settings/org.eclipse.jdt.core.prefs | 0 .../Population_Extraction.class | Bin 0 -> 4874 bytes .../Population_Extraction.java | 99 ++++++++++++++++++ 8 files changed, 100 insertions(+), 75 deletions(-) delete mode 100644 WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class delete mode 100644 WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java rename {WikiPopulationScraper => Worldometer_Project}/.classpath (100%) rename {WikiPopulationScraper => Worldometer_Project}/.project (90%) rename {WikiPopulationScraper => Worldometer_Project}/.settings/org.eclipse.core.resources.prefs (100%) rename {WikiPopulationScraper => Worldometer_Project}/.settings/org.eclipse.jdt.core.prefs (100%) create mode 100644 Worldometer_Project/bin/table_extraction/Population_Extraction.class create mode 100644 Worldometer_Project/src/table_extraction/Population_Extraction.java diff --git a/WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class b/WikiPopulationScraper/bin/data_extraction/WebTableDataExtraction.class deleted file mode 100644 index 1811f0ffd52968e21c425ba7ee677544878765f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4346 zcmb_g`&$&(8Ga8eJIgYFK~UEivq?3A0;`zV1T;nv3Zm|eptjGnx2KA1Hf)RZ+$TVO-uslH{a20hobCpwyjHQXhz!*M54PEOj> zMk+5YX`6*?Dn02sS()ncUx!`utaRg8hsr9{V}phUfqTN1(JO(lj*ZyFb7u|Pn2-WZ zjZJ-PF4%S~r~I1lsKRD!(QvoG_7Ai`$356eO*5vQac1b8J8By7C8;Qh`!(Drux)Kc zfxM0nsYu1L#;ln&pOux^jvX2vD2eyrT;7ve9gS#WtepZw-w-(7bW+FZb&tN6Mz$k{ zT>=mFnRzeij3?7h!DdWkK55vQq$ge1^c=rGHkb4q&#;m?Cs(ix<)S!RkkZg9u-*@8 z>CQTKV~>EAQ_3uczM(NZ@_PlE!_J*A}y;b9UWmCG!4h_ 
zn83iK=jHP4sg$%^XUr)xCo`tes*a|5Ddpo)XMA+Y$D{P|Xhst78EL2K^wF`o(Pj54 zgm>!bQeG^dkX{@D-8y`kCQwg?%SziDayn&~#d9kU2=s*m zkUGPSSxc3V?5A~%;u)TknQdA$xOmP2d#Uk)9o!;4#YR~8G>i#s4Lc|EvO73M3>kqk zkKFtu#&t|6&uX4AF<@k66%3d<&R|MFA1CJ(*Dxh_u5nsFr+q6`=8z?`a5RueYD#BR zisTf%^oByuRd!|6YEMT2XL%Ln$(sHR*sDyN)iH->X)beYh!-nEL+18%_#iG`QgJ+o zPipu?Db4y9n2zVw1`&3$b!4Vb={S#1Gt08Zlw{!3hBsn*lWgTZ!}fUYb0J_`t@8gh z5_*~o_^gJ{yvN=CNxNs$(!cz4e9kWheC{6fjP%qH%MAvy9$(P$MKu?hE|_#x#Be8g zp7kqaMNs5*p4T;qhPyuf0ssz#%(Aq(B^XQ6=`knGk5=;4*IBas2}cr zBR8b@NW+U7eo+FFAy#5DYOZ(>YyXJj*Z8f5-;`tq#mM41uHZEuBX3E`9F6b>9**Ox zg5K}QDa*d;wN$Z7 zdXr8jAIA;6q2c!|yq5Kt_E~32rUDQL$=kzH+i`heoX4927gy$PQ99W0abD<_%x~*+ zoT);tSSqaKkQK)%SvbU?g@~_As*u~YxJ8#?S%ao0J8Gl29>X7ahbTlS{Iw@V^10;k zrHU=+Z&#I`54fODXUU)y_%dh@DCDt8&hE8s>2|4GTjn+Vi3Ra0cZE4X!(U2bTyo^1 zuZZ=)Tjbwj;mx6-(8YbMc*<3&$0tFAD4!a8DZT~Y6+5pXc7=^H#Q9XyKsBoP)B^#& ztJ$kW4Qe^U#pRWJt^+gb2G*Ut#suB9i1h-CNT|oX0W15ErQfCkB_%kCSs9Q zgYiVHV%1<(BDSPtX}O+=X{#iw6EP}%pQ0N2AW@l!C#n*9qB>D?9p^v`nqP>DhG>H7 zXyi+`AHIw&{mt`ufgb$4z-838*9Of*?OQ?P3PZC=bc$|V;QOc;WH2Iru+H*#@S9*S8bv3+0D7=r9K}IuJ|Ir;eTtR((|l)e$Op>H0T}QlyhwXu z;xxXDuMjFJu^wN=*J%GiQI4-uUpejg7ru#a@dVTOC%%pE5R)GMj*Iv%!FPc)_&v^Q zl>ITj&)FEQJBJ_O5&?J%Q?&OI&;Blk@gugXXzw9PUV=`moADA`)wF#hUS_L?mPhat zyh00pO3OCz9cX=(S_2KQQp07=?Spokj?pyyOhdhfHVsEK7#gxni2vY5*=^##ESLp# zUFg^F^LPcV;>Sdg7I=#LtMsX@$1fM~tILQz!6bP7ia!{o=v6vEWmz4~0=l`dhy5em zxpyh!;=alAoL6S?{}sI&u12`<8oMfW7Ws~G%eVNuiMQF?%-KKU&-g22znS&Lt!O2& KP>;VwEB^-~xW|P6 diff --git a/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java b/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java deleted file mode 100644 index dbb0edc..0000000 --- a/WikiPopulationScraper/src/data_extraction/WebTableDataExtraction.java +++ /dev/null @@ -1,74 +0,0 @@ -package data_extraction; - -import java.time.Duration; -import java.util.List; -import org.openqa.selenium.By; -import org.openqa.selenium.JavascriptExecutor; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.WebElement; 
-import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.support.ui.ExpectedConditions; -import org.openqa.selenium.support.ui.WebDriverWait; - -public class WebTableDataExtraction { - - public static void main(String[] args) { - WebDriver driver = new ChromeDriver(); - driver.manage().window().maximize(); - - try { - // Print the message before extraction - System.out.println("List of countries and territories by total population"); - - // Open the Wikipedia page - WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); - driver.get("https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"); - waitForTheUser(); - - JavascriptExecutor scrollDownOne = (JavascriptExecutor) driver; - scrollDownOne.executeScript("window.scrollBy(0,1200)"); - waitForTheUser(); - - // Locate the table by XPath - WebElement table = wait.until(ExpectedConditions.elementToBeClickable(By.xpath("//table[@class='wikitable sortable sticky-header sort-under mw-datatable col2left col6left jquery-tablesorter']"))); - waitForTheUser(); - - // Get all rows from the table (skip the first row as it's the header) - List rows = table.findElements(By.tagName("tr")); - - // Loop through each row - for (int i = 1; i < rows.size(); i++) { // Start from 1 to skip header - WebElement row = rows.get(i); - - // Get all columns (td) in the current row - List cols = row.findElements(By.tagName("td")); - - // Extract the Location, Population, % of world, Date, Source and Notes - if (cols.size() > 1) { - String location = cols.get(0).getText().trim(); // Location (1st column) - String population = cols.get(1).getText().trim(); // Population (2nd column) - String perc_world = cols.get(2).getText().trim(); // % of world (3rd column) - String date = cols.get(3).getText().trim(); // Date (4th column) - String source = cols.get(4).getText().trim(); // Source (5th column) - String notes = cols.get(5).getText().trim(); // Notes (6th column) - - // Print the 
extracted data - System.out.println("Location: " + location + " | Population: " + population + " | % of world: " + perc_world + " | Date: " + date + " | Source: " + source + " | Notes: " + notes); - } - } - } catch (Exception e) { - e.printStackTrace(); - } finally { - // Close the browser after scraping - driver.quit(); - } - } - - public static void waitForTheUser() { - try { - Thread.sleep(2000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } -} diff --git a/WikiPopulationScraper/.classpath b/Worldometer_Project/.classpath similarity index 100% rename from WikiPopulationScraper/.classpath rename to Worldometer_Project/.classpath diff --git a/WikiPopulationScraper/.project b/Worldometer_Project/.project similarity index 90% rename from WikiPopulationScraper/.project rename to Worldometer_Project/.project index 68201ba..b1285db 100644 --- a/WikiPopulationScraper/.project +++ b/Worldometer_Project/.project @@ -1,6 +1,6 @@ - WikiPopulationScraper + Worldometer_Project diff --git a/WikiPopulationScraper/.settings/org.eclipse.core.resources.prefs b/Worldometer_Project/.settings/org.eclipse.core.resources.prefs similarity index 100% rename from WikiPopulationScraper/.settings/org.eclipse.core.resources.prefs rename to Worldometer_Project/.settings/org.eclipse.core.resources.prefs diff --git a/WikiPopulationScraper/.settings/org.eclipse.jdt.core.prefs b/Worldometer_Project/.settings/org.eclipse.jdt.core.prefs similarity index 100% rename from WikiPopulationScraper/.settings/org.eclipse.jdt.core.prefs rename to Worldometer_Project/.settings/org.eclipse.jdt.core.prefs diff --git a/Worldometer_Project/bin/table_extraction/Population_Extraction.class b/Worldometer_Project/bin/table_extraction/Population_Extraction.class new file mode 100644 index 0000000000000000000000000000000000000000..a17ef467ec8f8d93e34f7031fbe84bf0c2e0c38a GIT binary patch literal 4874 zcmdT|YjhlC8GgRWW@mPH(xj85bhrj>Y0@^?w9q0+OQA^$X_K^c(}sX0o$OAMp}RBN 
z*;$fs5k&9;;sviLN`Y!oM65`gP$_s<1VIm8v8edV!+($G=mDRZS&}SS3xC~n^3C_X z-|f9V@B8JsH=a5PU@QJELlU^aGe)!Kh&koCM%uF+JGslr7qf=n9qE~)%7_TmU2oiA zB(sJ+p4>Tly_xm|qMIz+@-7vqYHryp5b1IsbVHE_L``#TloxCY$Shm2j=4<=rxk`DiWsi5Z?3J-pV3p&JC!M@$ zA2gB$Gi%ybF_%nFxK7SYb_Jhq*Sf)U$1?9z?nE(##sU@-_-^c=ipr4G-=rxXOhZ!$D4+NmAt44?~!r7 zz>3pf6s}kCUf)tm&X}@t){SNiE3rz(1!eY5O&2^fr=l4xw6#;D=^I1ETTXHpweC^( zrjhGZu!b9l95U{GAyr}j_&{Baa@c`WW0~=7u@ez^nvc88@RcO4$z}w&REJ!J9dVSct-Q~ zmXL+Am7Q^{$7bKP>G2zzPjC3nN(Z#;RdAU=&xGgY3mwViNJq6R1 zaV2BDlJ2%MrVDA;%6mOiX1eG(u7d3htkCV+IOSxsTc?}b*RF4GZ%N<^^r`5_fIy8I zY%o*7Zh=cGSZ!f+hM3V=QtSd4q0{UVxH6dxqW{{<(pjTW*t{wjH3e;JfsvgzGo{g} z<7P~k!;Ik>gFcQl*-q=2<4}dG_QkOqgECSSqLRQ{r&a6$!K!*HZ+H`QsS12o4(hGb z^nj~zjf`P|b(IcZ;E6p`c|xR_>7q+n0R*VH7Vj5mAQt6J+Z%MYnq65deINwJzRLc@ zJ1aADx~l?5y-vjl_ES!Ji6CC7?(2>07col16*F)DCwB{>;pJ)sl|G6IrE z6y1$67302UWzQHNFmh%B1}qiVwV#W$i+G*k9O&>)Q(}Z3Pbb9dHyJB%@tnC@COI@p-&bP^5nkn}(a6Zt9v~ z#xoTVgX06HH#eT<{%+GQSl+aPn+PC%hMj4;oG_^176NL&HSQX=S5R;(cWyIXudQjf z;qjyo`m<)Ht?6?A+z)Yfk2`AEK47j?aHl{b1kluk;WC}J;v*_PijOfF1Q0nUu&IL2 z{_j{Thg&N2Zh^;I|DS#d+=EZb_ymEi+&g^u`;?zCdMo0}y}oZ|{N0~b@i}~+g+qZk zE~r;}efRvLiu>>-T9WuV$SR28$vhj}5g=5lq=@5wd{xF*%9CPHZm4*`KOn+EPQimr z9*d##qIP0u91r7bG7i6$1cUQ?rqX6mAgK6yP%s4+PI*T9z#uC!hEM|@QSl9bE^@GF zk->=Jj?-Q17uvhTCOsq*)HDL;y+d~5Iev-|Xsftz0lJi>R3^%rFeh|7llD%Vs;nHz zi}MP%RtIP<&{Xlxc~Iffi0e#J%DT`N^JpL-EdKSo0GQb%?;#C7}$(znkh5FI5hkksT9&Ak1?J{SU^w#a?a!mzZy&w z{Wu=sHGUbB2=e$y!H?oHwGB=ZgKFp1I5^>&T(M-Hvdt;Fl#;|!nN{vA)X%c=I(j<) zRxQ2$Z5rG3PP)F-w`t&U{YE~>rZOIt@#``_C5z6apQuVTmHbj1zs2um{H`n{d`XOq zia+3ww00qDnt7Ut)zscNUh%#E&rB%u(ba1+W4pyX)8t$r_}+2)734RbqJq-qHp_of zoH-BN3!W@({vpgey|!(-U4BVn(nI7@InGE4%U_(&s;Md{6RX(ctA(x1agtxbZ}sZq zP>!-AAGw!C8#BZo2zxMTgENVoIu_1ar)(&S=0;6Lid~V1ip(E z+UjFidjji+*EHaw8Eia`P5o=MEwi{>!VJ3CJc}y^TD9J)r%^SF9aR`2ajmv<23O5s z=ot1r3!@`KnjX>8Gcac{A)%w%KR8>ba?@~?W~YWDT0S*gt+}b;sOF`HW$lL4u%b<+ zhU=o*jcjG@X10p<0k$#iHnwr?cD4!aj?^)Hc*sBRE;1U~7(F6x)vFt$v-r3K|8iNE zwNKC8tmw)tK0|9C7JGDsEQhOR@dbfn_;N=~S0amuab2liL`>+)+$D3{Rb7!6t*p@% 
zUi!9+mgq{|;{COxE)0y#QnoQ&V;k4&*(UUczzlapMPpQ_uBhP$-?+(lj*h4v)ee=p zM?m(?Qh`kSyp4PWXZNo?hKJ~zM+AW$BYouhR6<6HZG`+LO()BEQAUG|^)jxIVaT|L0Qqk*CB2Chq;M8O#>4$G zUW!)}Ciy@OVUl{50tl^bz;9;o@)0OI2#S9?8W5vg`U+L+qpn7_fa|z%gya20+R-@# sjt4BC;=Iom|Etw2;53n5S(#Q*>R literal 0 HcmV?d00001 diff --git a/Worldometer_Project/src/table_extraction/Population_Extraction.java b/Worldometer_Project/src/table_extraction/Population_Extraction.java new file mode 100644 index 0000000..69b3d31 --- /dev/null +++ b/Worldometer_Project/src/table_extraction/Population_Extraction.java @@ -0,0 +1,99 @@ +package table_extraction; + +import java.time.Duration; +import java.util.List; +import org.openqa.selenium.By; +import org.openqa.selenium.JavascriptExecutor; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.support.ui.ExpectedConditions; +import org.openqa.selenium.support.ui.WebDriverWait; + +public class Population_Extraction { + + public static void main(String[] args) { + WebDriver driver = new ChromeDriver(); + driver.manage().window().maximize(); + + try { + // Print the message before extraction + System.out.println("World Population"); + + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); + driver.get("https://www.worldometers.info/world-population/population-by-country/"); + waitForTheUser(); + + JavascriptExecutor scrollDownOne = (JavascriptExecutor) driver; + scrollDownOne.executeScript("window.scrollBy(0,200)"); + waitForTheUser(); + + // Locate the table by XPath + WebElement table = wait.until(ExpectedConditions.elementToBeClickable(By.xpath("//table[@class='table table-striped table-bordered dataTable no-footer']"))); + waitForTheUser(); + + // Get all rows from the table (skip the first row as it's the header) + List rows = table.findElements(By.tagName("tr")); + + // Print headers + System.out.printf("%-5s %-30s %-15s %-15s %-15s %-10s %-15s %-10s %-10s 
%-10s %-15s %-10s%n", + "No", "Country", "Population", "Yearly Change", "Net Change", "Density", "Land Area", + "Migrants", "Fert. Rate", "Med. Age", "Urban Pop%", "World Share"); + System.out.println("-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------"); + + // Loop through each row + for (int i = 1; i < rows.size(); i++) { + WebElement row = rows.get(i); + + // Get all columns (td) in the current row + List cols = row.findElements(By.tagName("td")); + + // Extract the No, Country, Population, Yearly Change, Net Change, Density, Land Area, Migrants, Fert. Rate, Med. Age, Urban Pop% and World Share + if (cols.size() > 1) { + String nos = cols.get(0).getText().trim(); // No (1st column) + String country = cols.get(1).getText().trim(); // Country (2nd column) + String population = cols.get(2).getText().trim(); // Population (3rd column) + String yearly_change = cols.get(3).getText().trim(); // Yearly Change (4th column) + String net_change = cols.get(4).getText().trim(); // Net Change (5th column) + String density = cols.get(5).getText().trim(); // Density (6th column) + String land_area = cols.get(6).getText().trim(); // Land Area (7th column) + String migrants = cols.get(7).getText().trim(); // Migrants (8th column) + String fert_rate = cols.get(8).getText().trim(); // Fert. Rate (9th column) + String med_age = cols.get(9).getText().trim(); // Med. 
Age (10th column) + String urban_popln = cols.get(10).getText().trim(); // Urban Pop% (11th column) + String world_share = cols.get(11).getText().trim(); // World Share (12th column) + + // Print the data in table format + System.out.printf("%-5s %-30s %-15s %-15s %-15s %-10s %-15s %-10s %-10s %-10s %-15s %-10s%n", + nos, country, population, yearly_change, net_change, density, land_area, migrants, + fert_rate, med_age, urban_popln, world_share); + } + } + + JavascriptExecutor scrollDownTwo = (JavascriptExecutor) driver; + scrollDownTwo.executeScript("window.scrollBy(0,4000)"); + waitForTheUser(); + + JavascriptExecutor scrollDownThree = (JavascriptExecutor) driver; + scrollDownThree.executeScript("window.scrollBy(0,4000)"); + waitForTheUser(); + + JavascriptExecutor scrollDownFour = (JavascriptExecutor) driver; + scrollDownFour.executeScript("window.scrollBy(0,4000)"); + waitForTheUser(); + + } catch (Exception e) { + e.printStackTrace(); + } finally { + driver.quit(); + } + } + + public static void waitForTheUser() { + try { + Thread.sleep(2000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } +} From afbdc7ebe55b25501252f4b15ef9a2b02dbf2f6a Mon Sep 17 00:00:00 2001 From: Nithu Lakshmi <110821617+niths09@users.noreply.github.com> Date: Sun, 15 Dec 2024 11:23:19 +0530 Subject: [PATCH 2/2] First README.md --- README.md | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index f4eeb3f..07cb79c 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,36 @@ -# Wikipedia Population Table Data Extraction +# World Population Data Extraction -This project demonstrates web scraping using Selenium WebDriver to extract data from an HTML table on a Wikipedia page. Specifically, it retrieves the list of countries and territories by total population, presenting a real-world example of data extraction and automation using Selenium. 
+This project automates the extraction of country-wise population data from the Worldometers website using Selenium WebDriver. The script captures key statistics, such as population, density, and urban population percentage, and displays them in a tabular format in the console. ## Features -- Automates navigation to a Wikipedia page on country populations. -- Extracts data from an HTML table, including: - - **Location** (Country or territory) - - **Population** - - **Percentage of world population** - - **Date** of population data - - **Source** of the data - - **Notes** -- Processes table rows dynamically to handle updates to the table structure or content. -- Uses JavaScript for smooth scrolling to the target table. +- **Automated Web Navigation**: Accesses the Worldometers Population by Country page programmatically. +- **Dynamic Table Parsing**: Extracts data dynamically from an HTML table using Selenium. +- **Data Columns Extracted**: + - Rank (No) + - Country + - Population + - Yearly Change + - Net Change + - Density + - Land Area + - Migrants + - Fertility Rate + - Median Age + - Urban Population (%) + - World Share +- **Formatted Console Output**: Displays the extracted data in a neatly formatted table in the console. +- **Smooth Scrolling**: Uses JavaScript Executor to scroll through the page for visibility of dynamic content. ## Prerequisites -Ensure you have the following before running the project: -1. **Java Development Kit (JDK)** - Version 8 or above. -2. **Google Chrome** - Latest stable version. -3. **ChromeDriver** - Version compatible with your Chrome browser. -4. **Selenium WebDriver** - Included in the project dependencies. +Ensure the following are set up on your system: +1. **Java Development Kit (JDK)** - Version 8 or above. +2. **Selenium WebDriver** - Include the required Selenium libraries in your project. +3. **Google Chrome** - Latest stable version. +4. **ChromeDriver** - Ensure the ChromeDriver version matches your browser version. 
+5. **Integrated Development Environment (IDE)** - Any IDE like IntelliJ IDEA or Eclipse for running Java programs. ## Technologies Used -- **Java** - The programming language for the project. -- **Selenium WebDriver** - For web element interaction and automation. -- **Google Chrome & ChromeDriver** - For browser-based automation. -- **JavaScript Executor** - For advanced browser interactions like scrolling. +- **Java** - The programming language used for the project. +- **Selenium WebDriver** - For web automation and data extraction. +- **Google Chrome & ChromeDriver** - To simulate browser-based interaction. +- **JavaScript Executor** - For smooth scrolling through the web page.