To build web scanning and data collection programs in Python, we use the requests, BeautifulSoup, and re (regular expressions) libraries. With these tools we can fetch web pages, extract the data we need, and process it.
1 Installing the Required Libraries
Install the requests and BeautifulSoup libraries with the following command:
pip install requests beautifulsoup4
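As an optional check that the installation succeeded, the minimal snippet below simply imports both packages and prints their versions; it is not part of the tutorial's program.

import requests
import bs4

# Print the installed versions to confirm both packages are available
print("requests:", requests.__version__)
print("beautifulsoup4:", bs4.__version__)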
2 Sending a Request to a Website and Downloading the Page
One of the first steps is to send an HTTP request to the website and download the page content. In the function below we use the requests library to fetch the page.
import requests

def get_html(url):
    """
    Download the HTML content of the given URL.
    """
    try:
        response = requests.get(url)   # Send a GET request
        response.raise_for_status()    # Check for HTTP errors
        return response.text           # Return the HTML text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Test URL
html = get_html("https://example.com")
if html:
    print(html[:200])  # Print part of the HTML text
Analysis:
requests.get(url) sends a GET request to the given URL and stores the response in the response variable.
response.raise_for_status() raises an error if the HTTP request failed (for example with a 404 or 500 status).
response.text returns the HTML text of the response.
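In practice it is also worth passing a timeout and a User-Agent header, since some sites respond slowly or reject requests that have no identifying header. The sketch below shows one way to do this; the function name get_html_safe, the header value, and the 10-second timeout are illustrative choices, not requirements.

import requests

def get_html_safe(url, timeout=10):
    """Fetch HTML with a timeout and a custom User-Agent header."""
    headers = {"User-Agent": "Mozilla/5.0 (compatible; my-scraper/0.1)"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None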
3 Extracting Data from the Page (Web Parsing)
To extract data from the page content we use the BeautifulSoup library. The function below extracts the title from the given HTML text.
from bs4 import BeautifulSoup

def get_page_title(html):
    """
    Extract the page title from the HTML content.
    """
    soup = BeautifulSoup(html, 'html.parser')  # Parse the HTML
    title = soup.title.string if soup.title else "Unknown"
    return title

# Get the page title
title = get_page_title(html)
print("Page title:", title)
Analysis:
BeautifulSoup(html, 'html.parser') creates a BeautifulSoup object for parsing the HTML text.
soup.title.string returns the text inside the page's <title> tag.
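The same approach extends to other elements. As an illustrative sketch (not part of the program above), one might pull the first <h1> heading and the meta description, assuming the page contains them; the function name is hypothetical.

from bs4 import BeautifulSoup

def get_heading_and_description(html):
    """Extract the first <h1> text and the meta description, if present."""
    soup = BeautifulSoup(html, 'html.parser')
    h1 = soup.find('h1')
    meta = soup.find('meta', attrs={'name': 'description'})
    heading = h1.get_text(strip=True) if h1 else None
    description = meta['content'] if meta and meta.has_attr('content') else None
    return heading, description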
4 Finding All Links on the Page
We find all a tags on the page and extract their URLs. This lets us inspect the internal and external links present on the page.
def get_all_links(html):
    """
    Extract all links from the HTML content.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for link in soup.find_all('a', href=True):  # Find all <a> tags with an href
        links.append(link['href'])
    return links

# Get all links
links = get_all_links(html)
print("Found links:", links)
Analysis:
soup.find_all('a', href=True) finds all a tags on the page that have an href attribute.
links.append(link['href']) appends the href value of each a tag to the links list.
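The href values may be relative (for example /about) or absolute. A common way to normalize them is urljoin from the standard library's urllib.parse module; the sketch below is a variant of get_all_links with a hypothetical name.

from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_absolute_links(html, base_url):
    """Extract all links and resolve them against the page's base URL."""
    soup = BeautifulSoup(html, 'html.parser')
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True)]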
5 Finding Specific Data with Regular Expressions
Sometimes you need the re library to extract specific data from a page, such as phone numbers or e-mail addresses.
import re

def find_emails(html):
    """
    Find e-mail addresses in the HTML content.
    """
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, html)
    return emails

# Extract e-mail addresses from the page
emails = find_emails(html)
print("Found e-mail addresses:", emails)
Analysis:
email_pattern is the regex (regular expression) pattern used to match e-mail addresses.
re.findall(email_pattern, html) extracts every substring of the HTML that matches the pattern and returns them as the emails list.
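Phone numbers can be found in the same way. The pattern below is only an example, tuned for international-style numbers such as +998 90 123 45 67; it would need to be adapted to the formats that actually appear on the target site, and the function name is hypothetical.

import re

def find_phone_numbers(html):
    """Find phone numbers written in a +<country code> ... style."""
    phone_pattern = r'\+\d{1,3}[\s-]?\d{2}[\s-]?\d{3}[\s-]?\d{2}[\s-]?\d{2}'
    return re.findall(phone_pattern, html)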
6 Scanning the Site Through Internal Links (URLs)
The function below follows internal links to scan other pages of the site and fetch their content.
def crawl_site(base_url, max_depth=2):
    """
    Crawl the site by following its internal links.
    """
    visited = set()
    to_visit = [(base_url, 0)]  # URL and depth level
    while to_visit:
        url, depth = to_visit.pop(0)
        if depth > max_depth or url in visited:
            continue
        print("Crawling:", url)
        visited.add(url)
        html = get_html(url)
        if html is None:
            continue
        links = get_all_links(html)
        for link in links:
            if link.startswith('/'):
                link = base_url + link  # Convert relative URLs to absolute URLs
            if link not in visited:
                to_visit.append((link, depth + 1))

# Crawl the site
crawl_site("https://example.com", max_depth=1)
Analysis:
to_visit = [(base_url, 0)] is the queue of links not yet visited; each entry holds a URL and its depth level.
if depth > max_depth or url in visited ensures we do not exceed the maximum depth and do not revisit pages reached through duplicate links.
get_all_links(html) finds all links on the page; relative links are then converted to absolute URLs inside the loop.
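One practical refinement, sketched below and not part of the program above, is to stay on the same domain and pause briefly between requests so the crawl does not overload the server. It reuses get_html() and get_all_links() defined earlier; urlparse, urljoin, and time.sleep come from the standard library, and the one-second delay and the function name crawl_site_polite are arbitrary choices.

import time
from urllib.parse import urljoin, urlparse

def crawl_site_polite(base_url, max_depth=2, delay=1.0):
    """Crawl only pages on the same domain, waiting between requests."""
    domain = urlparse(base_url).netloc
    visited = set()
    to_visit = [(base_url, 0)]
    while to_visit:
        url, depth = to_visit.pop(0)
        if depth > max_depth or url in visited:
            continue
        visited.add(url)
        html = get_html(url)  # reuses get_html() defined earlier
        if html is None:
            continue
        for link in get_all_links(html):   # reuses get_all_links() defined earlier
            absolute = urljoin(url, link)  # resolve relative links
            if urlparse(absolute).netloc == domain and absolute not in visited:
                to_visit.append((absolute, depth + 1))
        time.sleep(delay)  # be polite to the server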
7 Downloading Files
The same approach can be used to download files of a given format from the site, for example PDF files.
def download_files(url, file_extension="pdf"):
    """
    Download files of the given format from the site.
    """
    html = get_html(url)
    if html is None:
        return
    links = get_all_links(html)
    for link in links:
        if link.endswith(file_extension):
            file_url = link if link.startswith("http") else url + link
            file_name = file_url.split("/")[-1]
            try:
                print(f"Downloading: {file_name}")
                file_data = requests.get(file_url)
                with open(file_name, "wb") as file:
                    file.write(file_data.content)
                print(f"Downloaded: {file_name}")
            except requests.RequestException as e:
                print(f"Download error: {e}")

# Download PDF files from the site
download_files("https://example.com", "pdf")
Analysis:
link.endswith(file_extension) keeps only files of the given format (for example .pdf).
file_url = link if link.startswith("http") else url + link converts relative links into absolute URLs.
with open(file_name, "wb") as file writes the file in binary mode.
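For large files it is safer to stream the response in chunks instead of loading the whole body into memory. The sketch below uses the stream=True option of requests.get together with iter_content; the 8192-byte chunk size and 30-second timeout are arbitrary choices, and the function name is hypothetical.

import requests

def download_file_streaming(file_url, file_name):
    """Download a single file in chunks to avoid holding it all in memory."""
    try:
        with requests.get(file_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(file_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Downloaded: {file_name}")
    except requests.RequestException as e:
        print(f"Download error: {e}")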
8 The Complete Program
The program below combines all of the functions above into a complete exercise in scanning a website and collecting its data.
import requests
from bs4 import BeautifulSoup
import re

def get_html(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

def get_page_title(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup.title.string if soup.title else "Unknown"

def get_all_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    return [link['href'] for link in soup.find_all('a', href=True)]

def find_emails(html):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.findall(email_pattern, html)

def crawl_site(base_url, max_depth=2):
    visited = set()
    to_visit = [(base_url, 0)]
    while to_visit:
        url, depth = to_visit.pop(0)
        if depth > max_depth or url in visited:
            continue
        print("Crawling:", url)
        visited.add(url)
        html = get_html(url)
        if html is None:
            continue
        title = get_page_title(html)
        emails = find_emails(html)
        links = get_all_links(html)
        print("Page title:", title)
        print("Found e-mail addresses:", emails)
        for link in links:
            if link.startswith('/'):
                link = base_url + link
            if link not in visited:
                to_visit.append((link, depth + 1))

def download_files(url, file_extension="pdf"):
    html = get_html(url)
    if html is None:
        return
    links = get_all_links(html)
    for link in links:
        if link.endswith(file_extension):
            file_url = link if link.startswith("http") else url + link
            file_name = file_url.split("/")[-1]
            try:
                print(f"Downloading: {file_name}")
                file_data = requests.get(file_url)
                with open(file_name, "wb") as file:
                    file.write(file_data.content)
                print(f"Downloaded: {file_name}")
            except requests.RequestException as e:
                print(f"Download error: {e}")

# Crawl the site and download PDF files
base_url = "https://example.com"
crawl_site(base_url, max_depth=1)
download_files(base_url, "pdf")
Conclusion
This program scans a website, follows its internal links, extracts e-mail addresses, and downloads files. The results were obtained by running the program against a real page, so the collected data is not shown here.