Code Here.
import time

import requests
from bs4 import BeautifulSoup

# Seconds to wait for a server response. Without a timeout, requests can
# block indefinitely on a stalled connection and hang the whole crawl.
REQUEST_TIMEOUT = 10

# Listing-page URLs for page numbers 1 through 4.
list_url = [
    'http://bj.xiaozhu.com/zhengzu-duanzufang-{}/'.format(str(i))
    for i in range(1, 5, 1)
]
# NOTE(review): nothing in the visible code reads or writes this list; it is
# kept only in case other code refers to it.
page_url = []


# Gender check
def get_sex(class_name):
    """Map an element's CSS class list to a gender label.

    Args:
        class_name: The value of the ``class`` attribute of the element
            matched by ``div.member_pic > div`` (BeautifulSoup returns
            multi-valued attributes such as ``class`` as a list of strings).

    Returns:
        '男' (male) when ``class_name`` equals ``['member_ico']`` exactly,
        otherwise '女' (female).
    """
    return '男' if class_name == ['member_ico'] else '女'


# Get the detail-page URLs
def get_page_url(url):
    """Fetch a listing page and scrape each detail page it links to.

    Every ``<a target="_blank">`` directly under ``ul.pic_list > li`` is
    followed by passing its ``href`` to :func:`get_content`.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            ``REQUEST_TIMEOUT`` seconds.
    """
    web_data = requests.get(url, timeout=REQUEST_TIMEOUT)
    time.sleep(1)  # pause for a second after each request
    soup = BeautifulSoup(web_data.text, 'lxml')
    for link in soup.select('ul.pic_list > li > a[target="_blank"]'):
        href = link.get('href')
        if not href:
            # An anchor without an href gives us nothing to fetch; skip it
            # instead of handing None to requests.get.
            continue
        get_content(href)


# Get the detail-page information
def get_content(url):
    """Fetch a detail page and print one dict per scraped listing.

    The selected element lists are combined with ``zip``, so the number of
    printed records equals the length of the shortest list; if any selector
    matches nothing, nothing is printed.

    Args:
        url: Address of the detail page to download.

    Raises:
        requests.exceptions.RequestException: If the request fails or exceeds
            ``REQUEST_TIMEOUT`` seconds.
    """
    page_data = requests.get(url, timeout=REQUEST_TIMEOUT)
    time.sleep(1)  # pause for a second after each request
    soup = BeautifulSoup(page_data.text, 'lxml')

    titles = soup.select('div.pho_info > h4 > em')
    addresses = soup.select('span[class="pr5"]')
    fees = soup.select('div.day_l > span')
    images = soup.select('img[id="curBigImage"]')
    avatars = soup.select('div.member_pic > a > img')
    sexs = soup.select('div.member_pic > div')
    names = soup.select('a[class="lorder_name"]')

    for title, address, fee, image, avatar, sex, name in zip(
            titles, addresses, fees, images, avatars, sexs, names):
        # Use a separate name for the record so the parsed document is not
        # rebound while its selections are being iterated.
        record = {
            'title': title.get_text(),
            'address': address.get_text(),
            'fee': fee.get_text(),
            'image': image.get('src'),
            'avatar': avatar.get('src'),
            'sex': get_sex(sex.get('class')),
            'name': name.get_text(),
        }
        print(record)


# Start crawling
for listing_url in list_url:  # not named `list`, which would shadow the builtin
    get_page_url(listing_url)