import os
import re
import urllib.request as lib


def craw_links(url, depth, keyword, processed):
    '''url: the URL to crawl
    depth: the remaining crawl depth
    keyword: the tuple of keywords to search for in a page
    processed: the list of URLs that have already been crawled
    '''
    contents = []
    if url.startswith(('http://', 'https://')):
        if url not in processed:
            # mark this URL as processed
            processed.append(url)
        else:
            # avoid processing the same URL again
            return
        print('Crawling ' + url + '...')
        fp = lib.urlopen(url)
        # Python 3 returns bytes, so the response needs to be decoded
        contents = fp.read()
        contents_decoded = contents.decode('UTF-8')
        fp.close()
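        # note: decode('UTF-8') above raises UnicodeDecodeError for pages in
        # other encodings; the original code does not guard against that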
        pattern = '|'.join(keyword)
        # if this page contains any of the keywords, save it to a file
        flag = False
        searched = None
        if pattern:
            searched = re.search(pattern, contents_decoded)
        else:
            # if no keywords to filter by are given, save the current page
            flag = True
        if flag or searched:
            fname = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', fname), 'wb') as fp:
                fp.write(contents)
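        # note: ':' and '/' are both mapped to '_', so two different URLs
        # can in principle collide on the same output file name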
        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)
        # crawl every link found in the current page
        for link in links:
            # resolve relative paths against the current URL
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    pass
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keyword, processed)


if __name__ == '__main__':
    processed = []
    keywords = ('KeyWord1', 'KeyWord2')
    # create the output directory only if it does not already exist
    if not os.path.isdir('craw'):
        os.mkdir('craw')
    craw_links(r'http://docs.python.org/3/library/index.html', 1, keywords, processed)
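

# A possible parallel variant (a sketch, not part of the original code): the
# docstring's mention of a process pool suggests fanning the first level of
# links out to worker processes. craw_links and the seed URL are taken from
# the listing above; the craw_one helper and the pool size of 4 are
# illustrative assumptions.
from multiprocessing import Pool

def craw_one(link):
    # each worker starts with its own empty 'processed' list, so a page
    # reachable from several links may be fetched more than once
    craw_links(link, 0, ('KeyWord1', 'KeyWord2'), [])

if __name__ == '__main__':
    os.makedirs('craw', exist_ok=True)
    seed = 'http://docs.python.org/3/library/index.html'
    page = lib.urlopen(seed).read().decode('UTF-8')
    links = [link for link in re.findall('href="(.*?)"', page)
             if link.startswith(('http://', 'https://'))
             and link.endswith(('.htm', '.html'))]
    with Pool(4) as pool:
        pool.map(craw_one, links)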