首页 技术 正文
技术 2022年11月19日
0 收藏 598 点赞 3,029 浏览 10810 个字
# -*- coding: utf-8 -*-
# Crawler: scrape Lagou (lagou.com) job listings per (language, city) pair
# with a real Chrome browser (Selenium), parse each posting with lxml, and
# store the documents in MongoDB.
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
import time
from pymongo import MongoClient


class WorkSpider:
    """Job-posting spider.

    Builds listing URLs from a (language, city) pair, pages through the
    results, and inserts one MongoDB document per posting into a collection
    named either ``z_<language>`` or ``z_<city>``.
    """

    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb
        # self.zfdb.authenticate("mongodbUser", "yourpassward")

    # Cities to crawl.
    def getCity(self):
        return [
            "全国",
            "北京",
            "郑州",
            # "上海",
            # "深圳",
            # "广州",
        ]

    # Languages / search keywords to crawl.
    def getLanguage(self):
        return [
            "Java",
            "Python",
            # "C",
            # "机器学习",
            # "图像识别",
            # "自然语言处理",
            # "区块链",
            # "精准推荐",
            # "Node.js",
            # "Go",
            # "Hadoop",
            # "Php",
            # ".NET",
            # "Android",
            # "iOS",
            # "web前端",
        ]

    # Observed: Lagou's listing URL varies with language and city as below.
    def getUrl(self, language, city):
        url = "https://www.lagou.com/jobs/list_" + language + "?px=default&city=" + city
        return url

    # URL list: every language in one city.
    def getCityUrl(self, city):
        urlList = []
        for language in self.getLanguage():
            urlList.append(self.getUrl(language, city))
        return urlList

    # URL list: one language across every city.
    def getLanguageUrl(self, language):
        urlList = []
        for city in self.getCity():
            urlList.append(self.getUrl(language, city))
        return urlList

    def getOnePageData(self):
        pass

    # MongoDB document structure for one posting.
    def getRentMsg(self, name, company, welfare, salaryMin, salaryMid, salaryMax,
                   experience, education, companyType, companyLevel, companySize):
        return {
            "name": name,                # job title (e.g. python工程师)
            "company": company,          # company name (xxx有限公司)
            "welfare": welfare,          # perks (餐补、下午茶、带薪年假)
            "salaryMin": salaryMin,      # salary lower bound (9k)
            "salaryMid": salaryMid,      # (salaryMin + salaryMax) / 2
            "salaryMax": salaryMax,      # salary upper bound (15k)
            "experience": experience,    # required experience (经验3-5年)
            "education": education,      # required education (本科)
            "companyType": companyType,  # sector (移动互联网/信息安全)
            "companyLevel": companyLevel,  # funding stage (上市公司)
            "companySize": companySize,  # head count (150-500人)
        }

    # Crawl all result pages for one (language, city) pair.
    # language => programming-language keyword
    # city => city name
    # collectionType => True: collection named after the language; False: after the city
    def main(self, language, city, collectionType):
        print(" 当前爬取的语言为 => " + language + " 当前爬取的城市为 => " + city)
        url = self.getUrl(language, city)
        print(" 当前爬取的路径为 => " + url)
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--start-maximized')   # maximize window; element lookups fail otherwise
        chrome_options.add_argument('--disable-infobars')  # hide the "controlled by automation" banner
        chrome_options.add_argument('--incognito')         # private-browsing mode
        # chrome_options.add_argument('--headless')        # run without a visible window
        browser = webdriver.Chrome(executable_path="chromedriver", options=chrome_options)
        browser.get(url)
        browser.implicitly_wait(10)
        for i in range(30):  # hard cap of 30 result pages
            selector = etree.HTML(browser.page_source)  # snapshot of the current page
            # BUGFIX: store the current page BEFORE deciding whether to stop,
            # otherwise the final page is parsed but never saved.
            self.getItemData(selector, language, city, collectionType)
            print('第{}页抓取完毕'.format(i + 1))
            soup = BeautifulSoup(browser.page_source, "html.parser")
            span = soup.find("div", attrs={"class": "pager_container"}).find("span", attrs={"action": "next"})
            print("span =>" + str(span))
            classArr = span['class']
            print("classArr =>" + str(classArr))  # e.g. ['pager_next', 'pager_next_disabled']
            attr2 = list(classArr)[1]
            if attr2 == "pager_next_disabled":
                print("已经爬到最后一页,爬虫结束")
                break
            else:
                print("还有下一页,爬虫继续")
                # Click the sibling of the current-page marker => next page.
                browser.find_element_by_xpath('//span[@class="pager_is_current"]/following-sibling::span').click()
                time.sleep(5)  # let the next page load before re-reading page_source
        browser.close()

    # Parse every job item on one page snapshot and insert it into MongoDB.
    def getItemData(self, selector, language, city, collectionType):
        items = selector.xpath('//*[@id="s_position_list"]/ul/li')
        for item in items:
            try:
                name = item.xpath('div[1]/div[1]/div[1]/a/h3/text()')[0]
                company = item.xpath('div[1]/div[2]/div[1]/a/text()')[0]
                welfare = item.xpath('div[2]/div[2]/text()')[0]
                salaryArray = item.xpath('div[1]/div[1]/div[2]/div/span/text()')[0].strip().split("-")
                salaryMin = salaryArray[0][:len(salaryArray[0]) - 1]  # drop trailing "k"
                salaryMax = salaryArray[1][:len(salaryArray[1]) - 1]
                salaryMid = (int(salaryMin) + int(salaryMax)) / 2
                educationArray = item.xpath('div[1]/div[1]/div[2]/div//text()')[3].strip().split("/")
                education = educationArray[0].strip()
                experience = educationArray[1].strip()
                conmpanyMsgArray = item.xpath('div[1]/div[2]/div[2]/text()')[0].strip().split("/")
                companyType = conmpanyMsgArray[0].strip()
                companyLevel = conmpanyMsgArray[1].strip()
                companySize = conmpanyMsgArray[2].strip()
                data = self.getRentMsg(
                    name,
                    company,
                    welfare,
                    int(salaryMin),
                    salaryMid,
                    int(salaryMax),
                    experience,
                    education,
                    companyType,
                    companyLevel,
                    companySize,
                )
                if collectionType:
                    self.zfdb["z_" + language].insert_one(data)
                else:
                    self.zfdb["z_" + city].insert_one(data)
                print(data)
            except Exception:
                # Malformed items (ads, layout changes) are skipped on purpose;
                # narrowed from a bare `except:` so Ctrl-C still propagates.
                print("======= exception =======")
                continue


if __name__ == "__main__":  # guard: do not start crawling on import
    spider = WorkSpider()  # job spider
    for language in spider.getLanguage():
        for city in spider.getCity():
            spider.main(language, city, True)
            time.sleep(5)  # be polite between (language, city) runs

Spider.py

以上是爬取功能的全部代码:

参考github上的源码修改:

主要步骤如下:

1、组装url

2、selenium爬取数据

3、存入数据库mongo

4、去广告:

        browser.get(url)
        browser.implicitly_wait(10)
        try:
            browser.find_element_by_xpath('//div[@class="body-container showData"]/div/div[2]').click()  # 点击广告
        except:
            pass

———————————————————————————————————————————————————

分析数据:

# -*- coding: utf-8 -*-
# Data analysis and visualisation of the scraped job data.
from os import path
from wordcloud import WordCloud, ImageColorGenerator
import jieba.analyse
import matplotlib.pyplot as plt
# from scipy.misc import imread  -- removed from scipy; imageio replaces it
import imageio
import os
import time
from pymongo import MongoClient


class Analycis:
    """Aggregates the spider's ``z_<language>`` MongoDB collections and
    renders charts (pyecharts bar/pie/treemap, wordcloud images)."""

    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb
        # self.zfdb.authenticate("mongodbUser", "yourpassward")

    # Cities covered by the scraped data.
    def getCity(self):
        return [
            "全国",
            "北京",
            "郑州",
            # "上海",
            # "深圳",
            # "广州",
        ]

    # Languages whose collections are analysed.
    def getLanguage(self):
        return [
            "Java",
            "Python",
            # "C",
            # "机器学习",
            # "图像识别",
            # "自然语言",
            # "区块链",
            # "Go",
            # "Php",
            # ".NET",
            # "Android",
            # "iOS",
            # "web前端",
            # "精准推荐",
            # "Node.js",
            # "Hadoop",
        ]

    # Metrics produced below:
    #   - sample count per language
    #   - average salary per language
    #   - education / experience requirements per language
    #   - welfare word cloud
    #   - company funding-stage ranking (A轮, B轮, ...)

    # Sample count per language -> (languages, counts).
    def getLanguageNum(self):
        analycisList = []
        for index, language in enumerate(self.getLanguage()):
            collection = self.zfdb["z_" + language]
            totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
            totalNum2 = list(totalNum)[0]["total_num"]
            analycisList.append(totalNum2)
        return (self.getLanguage(), analycisList)

    # Average salary per language -> (languages, averages rounded to 2 dp).
    def getLanguageAvgSalary(self):
        analycisList = []
        for index, language in enumerate(self.getLanguage()):
            collection = self.zfdb["z_" + language]
            totalSalary = collection.aggregate([{'$group': {'_id': '', 'total_salary': {'$sum': '$salaryMid'}}}])
            totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
            totalNum2 = list(totalNum)[0]["total_num"]
            totalSalary2 = list(totalSalary)[0]["total_salary"]
            analycisList.append(round(totalSalary2 / totalNum2, 2))
        return (self.getLanguage(), analycisList)

    # Education requirements for one language (for a pyecharts pie/word cloud).
    def getEducation(self, language):
        results = self.zfdb["z_" + language].aggregate([{'$group': {'_id': '$education', 'weight': {'$sum': 1}}}])
        educationList = []
        weightList = []
        for result in results:
            educationList.append(result["_id"])
            weightList.append(result["weight"])
        return (educationList, weightList)

    # Experience requirements for one language (for a pyecharts treemap).
    def getExperience(self, language):
        results = self.zfdb["z_" + language].aggregate([{'$group': {'_id': '$experience', 'weight': {'$sum': 1}}}])
        totalAvgPriceDirList = []
        for result in results:
            totalAvgPriceDirList.append(
                {"value": result["weight"], "name": result["_id"] + " " + str(result["weight"])})
        return totalAvgPriceDirList

    # Concatenate welfare strings of all languages (word-cloud input).
    def getWelfare(self):
        content = ''
        queryArgs = {}
        projectionFields = {'_id': False, 'welfare': True}  # fetch only the welfare field
        for language in self.getLanguage():
            collection = self.zfdb["z_" + language]
            searchRes = collection.find(queryArgs, projection=projectionFields).limit(1000)
            for result in searchRes:
                print(result["welfare"])
                content += result["welfare"]
        return content

    # Company funding-stage ranking across all languages (for a bar chart).
    def getAllCompanyLevel(self):
        levelList = []
        weightList = []
        newWeightList = []
        attrList = ["A轮", "B轮", "C轮", "D轮及以上", "不需要融资", "上市公司"]
        for language in self.getLanguage():
            collection = self.zfdb["z_" + language]
            results = collection.aggregate([{'$group': {'_id': '$companyLevel', 'weight': {'$sum': 1}}}])
            for result in results:
                levelList.append(result["_id"])
                weightList.append(result["weight"])
        # Fold duplicate levels (one entry per language) into attrList order.
        for index, attr in enumerate(attrList):
            newWeight = 0
            for index2, level in enumerate(levelList):
                if attr == level:
                    newWeight += weightList[index2]
            newWeightList.append(newWeight)
        return (attrList, newWeightList)

    # ======================= chart helpers =======================

    # Render a pie chart to render.html.
    def showPie(self, title, attr, value):
        from pyecharts import Pie
        pie = Pie(title)
        pie.add("",
                attr,
                value,
                radius=[40, 75],
                label_text_color=None,
                is_label_show=True,
                legend_orient="vertical",
                legend_pos="left", )
        pie.render()

    # Render a treemap to render.html.
    def showTreeMap(self, title, data):
        from pyecharts import TreeMap
        treemap = TreeMap(title, width=1200, height=600)
        treemap.add("深圳", data, is_label_show=True, label_pos='inside', label_text_size=19)
        treemap.render()

    # Render a bar chart to render.html.
    def showLine(self, title, attr, value):
        from pyecharts import Bar
        bar = Bar(title)
        bar.add("深圳", attr, value, is_convert=False, is_label_show=True, label_text_size=18, is_random=True,
                xaxis_interval=0,
                legend_text_size=18, label_text_color=["#000"])
        bar.render()

    # Render a word-cloud image shaped by a mask picture.
    def showWorkCloud(self, content, image_filename, font_filename, out_filename):
        # BUGFIX: dirname(__file__), not dirname(__name__) — __name__ is the
        # module name, not a path.
        d = path.dirname(__file__)
        # TF-IDF keyword extraction; topK = number of highest-frequency terms.
        tags = jieba.analyse.extract_tags(content, topK=100, withWeight=False)
        text = " ".join(tags)
        # Background/mask picture for the cloud shape.
        img = imageio.imread(path.join(d, image_filename))
        # A Chinese font must be given, otherwise glyphs render as boxes.
        wc = WordCloud(font_path=font_filename,
                       background_color='black',
                       mask=img,              # cloud shape
                       max_words=500,         # vocabulary cap
                       max_font_size=130,     # defaults to image height if unset
                       # width/height are ignored when mask is set
                       margin=2,
                       prefer_horizontal=0.9  # 10% of words laid out vertically
                       )
        wc.generate(text)
        img_color = ImageColorGenerator(img)
        plt.imshow(wc.recolor(color_func=img_color))
        wc.to_file("loutput.jpeg")
        plt.axis("off")
        plt.show()
        wc.to_file(path.join(d, out_filename))

    # Render a pyecharts word cloud to render.html.
    def showPyechartsWordCloud(self, attr, value):
        from pyecharts import WordCloud
        wordcloud = WordCloud(width=1300, height=620)
        wordcloud.add("", attr, value, word_size_range=[20, 100])
        wordcloud.render()


if __name__ == "__main__":  # guard: do not run the analysis on import
    analycis = Analycis()
    # Sample count per language.
    (attr, value) = analycis.getLanguageNum()
    analycis.showLine("样本数量", attr, value)
    os.rename("render.html", "sampleNum.html")
    # Average salary per language.
    (attr, value) = analycis.getLanguageAvgSalary()
    analycis.showLine("各语言平均工资", attr, value)
    os.rename("render.html", "languageAvgSalary.html")
    # Education requirements per language.
    os.makedirs("languageEducation", exist_ok=True)  # BUGFIX: rename target dir must exist
    for language in analycis.getLanguage():
        (attr, value) = analycis.getEducation(language)
        print(attr, value)
        # BUGFIX: this chart shows education, not 工作年限 (years of experience).
        analycis.showPie(" " + language + " 学历要求", attr, value)
        os.rename("render.html", "./languageEducation/" + language + "Education.html")
    # Experience requirements per language.
    os.makedirs("languageExperience", exist_ok=True)  # BUGFIX: rename target dir must exist
    for language in analycis.getLanguage():
        data = analycis.getExperience(language)
        print(data)
        # BUGFIX: this chart shows experience, not 学历 (education).
        analycis.showTreeMap(" " + language + "工作年限要求", data)
        os.rename("render.html", "./languageExperience/" + language + "Experience.html")
    # Welfare word cloud.
    analycis.showWorkCloud(analycis.getWelfare(), "docker.jpeg", "kh.ttf", out_filename="loutput.jpeg")
    # Company funding-stage bar chart.
    (attr, value) = analycis.getAllCompanyLevel()
    print(attr, value)
    analycis.showLine("公司级别", attr, value)
    os.rename("render.html", "companyLevel.html")

分析

相关推荐
python开发_常用的python模块及安装方法
adodb:我们领导推荐的数据库连接组件bsddb3:BerkeleyDB的连接组件Cheetah-1.0:我比较喜欢这个版本的cheeta…
日期:2022-11-24 点赞:878 阅读:9,071
Educational Codeforces Round 11 C. Hard Process 二分
C. Hard Process题目连接:http://www.codeforces.com/contest/660/problem/CDes…
日期:2022-11-24 点赞:807 阅读:5,549
下载Ubuntn 17.04 内核源代码
zengkefu@server1:/usr/src$ uname -aLinux server1 4.10.0-19-generic #21…
日期:2022-11-24 点赞:569 阅读:6,397
可用Active Desktop Calendar V7.86 注册码序列号
可用Active Desktop Calendar V7.86 注册码序列号Name: www.greendown.cn Code: &nb…
日期:2022-11-24 点赞:733 阅读:6,174
Android调用系统相机、自定义相机、处理大图片
Android调用系统相机和自定义相机实例本博文主要是介绍了android上使用相机进行拍照并显示的两种方式,并且由于涉及到要把拍到的照片显…
日期:2022-11-24 点赞:512 阅读:7,809
Struts的使用
一、Struts2的获取  Struts的官方网站为:http://struts.apache.org/  下载完Struts2的jar包,…
日期:2022-11-24 点赞:671 阅读:4,889