leroncheung.github.io

Leroncheung's first Python web-scraping code, copied from a cookbook.

import re  # regular expressions

# Example calls from the original notes. The patterns were garbled in this
# copy (the HTML tags and the `*` quantifiers are missing); presumably they
# were meant to read as below -- verify against the page being scraped:
#   re.findall(r'<td class="w2p_fw">(.*?)</td>', html)
#   re.findall(r'''<tr id="places_area_row">.*?<td\s*class=["']w2p_fw["']>(.*?)</td>''', html)

from bs4 import BeautifulSoup

# Walk-through: pull the "area" value out of a country page with
# Beautiful Soup.
# NOTE(review): `download` and `url` are not defined in this excerpt; they
# are presumably provided elsewhere -- confirm before running.
html = download(url)

# Two alternative parsers are shown; the second assignment replaces the first.
# 'html.parser' is the built-in parser name ('html.parse' is not a registered
# parser and makes BeautifulSoup raise FeatureNotFound).
soup = BeautifulSoup(html, 'html.parser')
soup = BeautifulSoup(html, 'html5lib')

# Locate the row by its id, then the value cell by its class.
tr = soup.find(attrs={'id': 'places_area_row'})
td = tr.find(attrs={'class': 'w2p_fw'})
area = td.text

# Same "area" lookup done with lxml and a CSS selector: parse the page,
# take the first value cell inside the area row, and read its text.
tree = fromstring(html)
td = tree.cssselect('tr#places_area_row > td.w2p_fw')[0]
area = td.text_content()

# Names of the table rows each scraper extracts; every name is substituted
# into a 'places_%s_row' row id by the scraper functions.
FIELDS = (
    'area',
    'population',
    'iso',
    'country_or_district',
    'capital',
    'continent',
    'tld',
    'currency_code',
    'currency_name',
    'phone',
    'postal_code_format',
    'postal_code_regex',
    'languages',
    'neighbours',
)

import re


def re_scraper(html):
    """Extract every field listed in FIELDS from a page using regular expressions.

    Args:
        html: The page source as a string.

    Returns:
        A dict mapping each field name in FIELDS to the text captured from
        the ``w2p_fw`` cell of that field's table row.

    Raises:
        AttributeError: If a field's row is not found (``re.search`` returns
            ``None``).
    """
    results = {}
    for field in FIELDS:
        # The previous pattern ('.?(.?)') had no %s placeholder, so the
        # %-formatting raised TypeError and no row could ever be matched.
        # The row id and cell class mirror the other scrapers in this file.
        pattern = r'<tr id="places_%s_row">.*?<td class="w2p_fw">(.*?)</td>' % field
        results[field] = re.search(pattern, html).groups()[0]
    return results

from bs4 import BeautifulSoup


def bs_scraper(html):
    """Extract every field listed in FIELDS from a page using Beautiful Soup.

    Args:
        html: The page source as a string.

    Returns:
        A dict mapping each field name in FIELDS to the text of the
        ``w2p_fw`` cell in that field's table row.

    Raises:
        AttributeError: If the table, a field's row, or its value cell is
            missing (``find`` returns ``None``).
    """
    # 'html.parser' is the built-in parser name; 'html_parse' is not a
    # registered parser and makes BeautifulSoup raise FeatureNotFound.
    soup = BeautifulSoup(html, 'html.parser')
    # The table lookup does not depend on the field, so do it once.
    table = soup.find('table')
    results = {}
    for field in FIELDS:
        row = table.find('tr', id='places_%s_row' % field)
        # Use `.text`: attribute access with any other name (such as `.TEXT`)
        # is treated as a child-tag lookup and returns None.
        results[field] = row.find('td', class_='w2p_fw').text
    return results

from lxml.html import fromstring


def lxml_scraper(html):
    """Extract every field listed in FIELDS from a page using lxml CSS selectors.

    Args:
        html: The page source as a string.

    Returns:
        A dict mapping each field name in FIELDS to the text content of the
        first element matched by that field's row/cell selector.

    Raises:
        IndexError: If a selector matches no element.
    """
    document = fromstring(html)
    selector = 'table > tr#places_%s_row > td.w2p_fw'
    return {
        name: document.cssselect(selector % name)[0].text_content()
        for name in FIELDS
    }

def lxml_xpath_scraper(html):
    """Extract every field listed in FIELDS from a page using lxml XPath queries.

    Args:
        html: The page source as a string.

    Returns:
        A dict mapping each field name in FIELDS to the text content of the
        first element matched by that field's XPath expression.

    Raises:
        IndexError: If an expression matches no element.
    """
    document = fromstring(html)
    expression = '//tr[@id="places_%s_row"]/td[@class="w2p_fw"]'
    return {
        name: document.xpath(expression % name)[0].text_content()
        for name in FIELDS
    }

Name		Name	Last commit message	Last commit date
Latest commit History 5 Commits
code		code
stylesheets		stylesheets
.gitignore		.gitignore
LICENSE		LICENSE
README.md		README.md
index.html		index.html
params.json		params.json

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

leroncheung.github.io

About

Uh oh!

Releases

Packages

Uh oh!

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

leroncheung.github.io

About

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Uh oh!

Contributors

Uh oh!

Languages

Packages