import math
import re
from typing import List, Tuple, Union

import pandas as pd
import urllib3
from bs4 import BeautifulSoup
def get_webpage_tables(url: str) -> List[BeautifulSoup]:
    """
    Download a webpage and return every ``<table>`` element found in it.

    Parameters:
        url (str): The url of the webpage to download.

    Returns:
        List[BeautifulSoup]: The result of ``find_all('table')`` on the parsed
        page, one entry per table on the webpage.
    """
    http = urllib3.PoolManager()
    response = http.request("GET", url)
    # Parse the raw response body with the pure-Python "html.parser" backend.
    soup = BeautifulSoup(response.data, features="html.parser")
    tables = soup.find_all('table')
    return tables
def process_num(num: str) -> Union[float, str]:
    """
    Convert a string representing a number to a float.

    Bracketed footnote markers (e.g. ``"[1]"``) are removed first, then every
    character other than digits, ``.`` and ``-`` is stripped, and the
    remainder is passed to ``float``.

    Parameters:
        num (str): A string representing a number.

    Returns:
        Union[float, str]: The float representation of the input string, or
        the original string unchanged if the cleaned text is not a valid float.
    """
    # Drop footnote markers before the generic clean-up; otherwise their
    # digits would be glued onto the value ("1.03[1]" -> 1.031).
    without_notes = re.sub(r'\[[^\]]*\]', '', num)
    try:
        return float(re.sub(r'[^\d.-]', '', without_notes))
    except ValueError:
        return num
def is_valid_cell(cells: List[BeautifulSoup]) -> bool:
    """
    Check that none of the given table cells holds the placeholder 'N/A'.

    A cell is considered invalid if its text content, with surrounding
    whitespace stripped, is exactly 'N/A'.

    Parameters:
        cells (List[BeautifulSoup]): Objects representing table cells; only
            their ``.text`` attribute is read.

    Returns:
        bool: True if all the cells are valid (also for an empty list),
        False otherwise.
    """
    return all(cell.text.strip() != 'N/A' for cell in cells)
def make_str_valid(s: str) -> str:
    """
    Cut a string at its first opening parenthesis.

    Everything from the first '(' to the end of the string is discarded, and
    any whitespace left dangling before the parenthesis is trimmed. A string
    without '(' is returned unchanged.

    Parameters:
        s (str): The string to make valid.

    Returns:
        str: The part of the input that precedes the first '('.
    """
    ind_delim = s.find('(')
    if ind_delim == -1:
        return s
    # Slice up to the parenthesis and trim, rather than slicing to
    # ``ind_delim - 1``: that form assumed exactly one space before '(' and
    # turned into ``s[:-1]`` when the string started with '('.
    return s[:ind_delim].rstrip()
def extract_country_sex_ratio(tables: List[BeautifulSoup]) -> pd.DataFrame:
    """
    Extract country sex ratios from a list of BeautifulSoup tables.

    For every table row that has more than one ``<td>`` cell and no 'N/A'
    cell, the first cell is taken as the country name (cleaned with
    ``make_str_valid``) and the last cell as the sex ratio (converted with
    ``process_num``).

    Parameters:
        tables (List[BeautifulSoup]): A list of BeautifulSoup tables.

    Returns:
        pd.DataFrame: One row per accepted table row, with the columns
        'Country' and 'Sex-Ratio'.
    """
    data = []
    for table in tables:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # Rows with at most one <td> carry no (country, value) pair, and
            # rows with an 'N/A' cell are skipped.
            if len(cells) <= 1 or not is_valid_cell(cells):
                continue
            country = make_str_valid(cells[0].text.strip())
            sex_ratio = process_num(cells[-1].text.strip())
            data.append([country, sex_ratio])
    return pd.DataFrame(data, columns=['Country', 'Sex-Ratio'])
# Default location of the ISO-3166 country list (CSV).
_ISO_CODES_URL = 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv'

# ISO-3166 name -> name used in the main DataFrame, for the countries whose
# spelling differs between the two sources.
_COUNTRY_RENAMES = {
    'Viet Nam': 'Vietnam',
    'United States of America': 'United States',
    'Iran (Islamic Republic of)': 'Iran',
    'Russian Federation': 'Russia',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    "Korea (Democratic People's Republic of)": 'Korea, North',
    'Korea, Republic of': 'Korea, South',
    'Bolivia (Plurinational State of)': 'Bolivia',
    "Côte d'Ivoire": 'Ivory Coast',
    'Congo': 'Congo, Republic of the',
    'Tanzania, United Republic of': 'Tanzania',
}


def get_country_codes(df: pd.DataFrame, codes_source=_ISO_CODES_URL) -> pd.DataFrame:
    """
    Attach ISO-3166 alpha-3 country codes to a DataFrame of countries.

    The code list is read as CSV from ``codes_source``; its first and third
    columns are used (expected to be named 'name' and 'alpha-3'). A handful
    of country names are rewritten to the spelling used in the main
    DataFrame before the codes are matched on the 'Country' column.

    Parameters:
        df (pd.DataFrame): Frame with a 'Country' column. It is not modified.
        codes_source: Anything ``pd.read_csv`` accepts (URL, path or
            file-like object). Defaults to the ISO-3166 list hosted on GitHub.

    Returns:
        pd.DataFrame: A copy of ``df`` with an added 'ISO-code' column, with
        every row that contains a NaN (including rows whose country has no
        matching code) removed.
    """
    codes = (
        pd.read_csv(codes_source)
        .iloc[:, [0, 2]]
        .rename(columns={'name': 'Country', 'alpha-3': 'ISO-code'})
        .drop_duplicates(subset=['Country'], keep='last')
        .dropna()
        # Names missing from the rename table (or from the CSV) are simply
        # left as they are instead of raising.
        .assign(Country=lambda frame: frame['Country'].map(
            lambda name: _COUNTRY_RENAMES.get(name, name)))
    )

    # Using the ISO-3166 coding standard to map countries
    iso_by_country = codes.set_index('Country')['ISO-code']
    result = df.copy()
    result['ISO-code'] = result['Country'].map(iso_by_country)

    # Clean data-frame ( NaNs )
    return result.dropna()
Part 1: Web Scraping with a Pipeline
Introduction:
Web scraping is an automated process to extract information from the internet. It is a powerful tool that can help businesses and individuals gather useful information from various sources on the internet. The data can be used for various purposes such as market research, sentiment analysis, trend analysis, etc. In this blog, we will discuss why web scraping is necessary and how it can be done using Python.
Motivation:
With the proliferation of the internet and the increasing availability of information online, there is a need for an efficient way to collect and analyze this information. Web scraping offers a solution to this problem by allowing users to extract large amounts of data from the internet quickly and easily. This data can then be used for various purposes such as market research, trend analysis, and competitor analysis, to name a few.
Why do we need web scraping?
Web scraping is essential because it allows us to collect and analyze data that is not readily available in a structured format. It helps us in understanding market trends, product reviews, and pricing strategies. It also enables us to gather data that can help us gain a competitive advantage in the market. Web scraping can be used for a wide range of applications such as e-commerce, data analysis, and sentiment analysis.
For example, imagine you own an online clothing store and want to monitor the prices of a popular competitor’s clothing items. You could use a web scraper to extract the prices of those items from their website and store the data in a spreadsheet. Then, you could set up an automated notification to alert you when a price drops below a certain threshold. This would give you a competitive advantage by allowing you to adjust your prices accordingly.
In this way, web scraping can save you time, help you make more informed decisions, and give you a leg up on the competition.
Example of web scraping using the code below:
The following code is an example of how web scraping can be done using Python. It downloads a webpage and extracts all of its tables using BeautifulSoup, pulls the country sex ratios out of those tables, and then attaches ISO country codes retrieved from a GitHub repository, dropping any rows that have no matching code.
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_sex_ratio'

# define pipeline: each step wraps one plain function in a FunctionTransformer
# so the output of one step becomes the input of the next
pipeline = Pipeline([
    ('get_tables', FunctionTransformer(get_webpage_tables)),
    ('extract_ratio', FunctionTransformer(extract_country_sex_ratio)),
    ('get_codes', FunctionTransformer(get_country_codes)),
])

# apply pipeline to input URL
df = pipeline.transform(url)

# bare expression: displays the resulting frame when run as a notebook cell
df
/usr/local/lib/python3.8/dist-packages/urllib3/connectionpool.py:842: InsecureRequestWarning:
Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
Country | Sex-Ratio | ISO-code | |
---|---|---|---|
1 | Afghanistan | 1.03 | AFG |
2 | Albania | 0.98 | ALB |
3 | Algeria | 1.03 | DZA |
4 | American Samoa | 1.0 | ASM |
5 | Andorra | 1.06 | AND |
... | ... | ... | ... |
225 | Wallis and Futuna | 1.04 | WLF |
227 | Western Sahara | 0.99 | ESH |
228 | Yemen | 1.02 | YEM |
229 | Zambia | 1.0 | ZMB |
230 | Zimbabwe | 0.96 | ZWE |
209 rows × 3 columns
import plotly.express as px

# Rows whose ratio exceeds this threshold are left out of the map
# (presumably so outliers do not dominate the colour scale).
thres = 1.3
df_th = df.drop(df[df['Sex-Ratio'] > thres].index)

# color palette @ https://plotly.com/python/builtin-colorscales/
fig = px.choropleth(
    df_th,
    locations='ISO-code',
    color="Sex-Ratio",
    hover_name="Country",
    color_continuous_scale=px.colors.sequential.Sunset,
    projection="natural earth",
)
fig.update_layout(
    title={'text': 'Sex-Ratio per country', 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
)
fig.show()
import plotly.express as px

# Rows whose ratio exceeds this threshold are left out of the map
# (presumably so outliers do not dominate the colour scale).
thres = 1.3
df_th = df.drop(df[df['Sex-Ratio'] > thres].index)

# color palette @ https://plotly.com/python/builtin-colorscales/
# Same map as a globe: only the projection differs ("orthographic").
fig = px.choropleth(
    df_th,
    locations='ISO-code',
    color="Sex-Ratio",
    hover_name="Country",
    color_continuous_scale=px.colors.sequential.Sunset,
    projection="orthographic",
)
fig.update_layout(
    title={'text': 'Sex-Ratio per country', 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
)
fig.show()