Last active
November 7, 2021 03:17
-
-
Save sdycgtgz/5daf42fad079473a412ed25383891da2 to your computer and use it in GitHub Desktop.
python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| from selenium import webdriver | |
| from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
| import xlwt | |
# Base Meituan hotel-listing URL for the city of Huangshan; every page of
# results is reached by paginating from this entry point.
url = 'http://hotel.meituan.com/huangshan/'

# Fetch the hotel pagination info and return the maximum page number
def get_page_num(url):
    """Return the total number of result pages for the hotel listing at *url*.

    Downloads the first listing page, parses the pagination bar
    (``li.page-link`` elements) and reads the page count from the last link.

    Returns:
        int: total page count; 1 when no pagination bar is present.
    """
    # Timeout so a stalled server cannot hang the whole scrape (original
    # request had none and could block forever).
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    # Pagination items of the listing's navigation bar.
    page_info = soup.find_all('li', class_='page-link')
    if not page_info:
        # Single-page listing: no pagination bar at all.  The original code
        # raised IndexError on page_info[-1] here.
        return 1
    # The last pagination link carries the total page count.
    page_num = page_info[-1].find('a').get_text()
    return int(page_num)
# Scrape all hotel details (name, link, address, rating, consumer count,
# price, last booking time) and write them to an Excel workbook.
def get_hotel_info(url):
    """Scrape every hotel on all listing pages of *url* into an Excel file.

    Uses PhantomJS via Selenium to render the JavaScript-driven listing,
    walks every page through the paginator, and writes one row per hotel to
    ``hotel_info_huangshan.xls``.

    Args:
        url: entry URL of the city's hotel listing.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    # Spoof a desktop-browser user agent so the site's anti-scraping checks
    # do not block the headless client.
    dcap['phantomjs.page.settings.userAgent'] = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
    # Explicit path to the PhantomJS binary on this machine.
    browser = webdriver.PhantomJS(
        "/Users/chenglv/phantomjs-2.1.1-macosx/bin/phantomjs",
        desired_capabilities=dcap)

    # Column headers; `fields` below must stay in the same order so each
    # scraped value lands under its matching header (the original relied on
    # dict insertion order for this pairing).
    header = ['酒店名', '网址', '酒店地址', '评价', '消费人数', '价格', '上次预约时间']
    fields = ('name', 'link', 'address', 'star',
              'consumers', 'price', 'last_order_time')

    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('hotel_info', cell_overwrite_ok=True)
    for col, title in enumerate(header):
        sheet.write(0, col, title)

    row = 1  # next worksheet row to fill (row 0 holds the headers)
    try:
        browser.get(url)
        # Hoisted: the original re-fetched the page count over HTTP on every
        # pass of its while-loop condition.
        total_pages = get_page_num(url)
        for page in range(total_pages):
            # Collect every hotel card on the current page.
            for item in browser.find_elements_by_class_name('info-wrapper'):
                title_el = item.find_element_by_class_name('poi-title')
                record = {
                    'name': title_el.text,
                    'link': title_el.get_attribute('href'),
                    # assumes "<district> <address>" layout — TODO confirm
                    'address': item.find_element_by_class_name(
                        'poi-address').text.split(' ')[1],
                    'star': item.find_element_by_class_name('poi-grade').text,
                    'consumers': item.find_element_by_class_name(
                        'poi-buy-num').text,
                    'price': item.find_element_by_class_name('poi-price').text,
                    'last_order_time': item.find_element_by_class_name(
                        'last-order-time').text,
                }
                for col, key in enumerate(fields):
                    sheet.write(row, col, record[key])
                row += 1
            if page < total_pages - 1:
                # Advance via the "next page" arrow.  Guarded so we do not
                # click past the final page (the original clicked once more
                # after the last page).
                browser.find_element_by_class_name('paginator') \
                       .find_element_by_class_name('next') \
                       .find_element_by_tag_name('a').click()
        # xlwt emits a binary .xls workbook; the original saved it with a
        # misleading .csv extension — fixed to .xls.
        book.save('hotel_info_huangshan.xls')
    finally:
        # Always terminate the PhantomJS process; the original leaked it.
        browser.quit()
def main():
    """Entry point: scrape the Huangshan hotel listing into an Excel file."""
    get_hotel_info(url)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment