Scraping exam questions with Python: save the text, extract the target fields with regular expressions, and store them in a MySQL database


  • Welcome to leave a comment and exchange ideas (a beginner documenting the learning journey; corrections are welcome. 🚀🚀🚀)
    • Removing dirty data encountered when scraping the pages
    • Preliminary run results
    • Regex-matching the text to extract the target fields and save them to the database

Welcome to leave a comment and exchange ideas (a beginner documenting the learning journey; corrections are welcome. 🚀🚀🚀)

🚔 Bored during the pandemic, I had the sudden idea of building an online exam system. The database design is finished, so I'm now writing a crawler to collect exam question data. The target site is shown below; the fields to extract are: knowledge point (考点), question, answer options, answer, and explanation. The knowledge-point field will later support the system's personalized recommendations. 🚔
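The post never shows the table definition itself. As a rough sketch, the question_info table that the second script inserts into might look like the following; the column names (tg, xx, da, fx, kd) come from that insert statement, while the types and the id column are my assumptions:

```python
# Hypothetical sketch of the question_info table used later in this post.
# Column names come from the insert statement below; types are assumptions.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS question_info (
    id INT AUTO_INCREMENT PRIMARY KEY,
    tg TEXT,  -- 题干 / question stem
    xx TEXT,  -- 选项 / options A-D
    da TEXT,  -- 答案 / answer
    fx TEXT,  -- 分析 / analysis
    kd TEXT   -- 考点 / knowledge point
) DEFAULT CHARSET = utf8;
"""

con = pymysql.connect(host='localhost', user='root', passwd='00000000',
                      db='login_test_1', charset='utf8')
with con.cursor() as cursor:
    cursor.execute(DDL)
con.commit()
con.close()
```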


Removing dirty data encountered when scraping the pages

🚑 While inspecting the page I noticed these elements; presumably the site's backend uses them for something. Since they make direct field matching inconvenient, I first grab the full page text, collect the contents of elements whose class is this_jammer (and similar) into a stop-word table, and then strip those dirty strings from the scraped question text. 🚑

🏎 Below is the function that saves the dirty-data (stop-word) table. 🏎

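The original screenshot of that function is not reproduced here; a minimal sketch of the idea, assuming the dirty strings are read from the same jammer classes and appended to a stop-word file (the function name and file layout are my guesses), could look like this:

```python
import requests
from bs4 import BeautifulSoup


def save_jammer_words(url, headers, stopword_path="stopwords.txt"):
    """Collect the contents of the jammer elements on one page and
    append them to a stop-word file, one entry per line."""
    req = requests.get(url, headers=headers)
    bf = BeautifulSoup(req.content.decode('utf-8', 'ignore'), 'html.parser')
    jammers = bf.find_all(class_=["this_jammer", "hidejammersa", "jammerd42"])
    with open(stopword_path, 'a', encoding='utf-8') as f:
        for tag in jammers:
            if tag.string:
                f.write(tag.string.strip() + '\n')
```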

🚜 The full code is as follows: 🚜
```python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup


def get_url(target_url, server, headers):
    """Collect the links of every "查看解析" (view explanation) page on one list page."""
    req = requests.get(target_url, headers=headers)
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='questions_col')
    a_bf = BeautifulSoup(str(div[0]), 'html.parser')
    a = a_bf.find_all('a')
    cheak_parsing_url = []
    for each in a:
        if each.string == "查看解析":
            full_url = server + each.get('href')
            cheak_parsing_url.append(full_url)
    print(cheak_parsing_url)
    return cheak_parsing_url


def change_page(target_url, server, headers):
    """Return the URL behind the "下一页" (next page) link, or None on the last page."""
    req = requests.get(target_url, headers=headers)
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='fenye')
    a_bf = BeautifulSoup(str(div[0]), 'html.parser')
    a = a_bf.find_all('a')
    full_url = None
    for each in a:
        if each.string == "下一页":
            full_url = server + each.get('href')
            print(full_url)
    return full_url


def get_html(url_list, file_path, headers):
    """Download each explanation page, blank the jammer elements, save the cleaned text."""
    for url in url_list:
        req = requests.get(url, headers=headers)
        content = req.content.decode('utf-8', 'ignore')
        bf = BeautifulSoup(content, 'html.parser')
        # The jammer elements carry the dirty strings; blank them in place.
        del_text = bf.find_all(class_=["this_jammer", "hidejammersa", "jammerd42"])
        for i in del_text:
            if i.string:
                i.string.replace_with("")
        texts = bf.find_all('div', class_='answer_detail')
        if not texts:
            continue
        text = texts[0].text.replace('\xa0', '').replace(' ', '').replace('\n', '')
        print(text)
        contents_save(file_path, text)


def contents_save(file_path, content):
    """
    :param file_path: path of the file the scraped text is appended to
    :param content: one question's cleaned text
    :return: None
    """
    with open(file_path, 'a', encoding="utf-8", errors='ignore') as f:
        f.write(content)
        f.write('\n')


def get_category(target_url, server, headers):
    """Collect the URL of every question category from the category index page."""
    req = requests.get(target_url, headers=headers)
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='shiti_catagory frame')
    a_bf = BeautifulSoup(str(div[0]), 'html.parser')
    a = a_bf.find_all('a')
    category = []
    for each in a:
        full_url = server + each.get('href')
        category.append(full_url)
    print(category)
    return category


if __name__ == "__main__":
    main_url = "https://tiku.21cnjy.com/tiku.php?mod=quest&channel=8&xd=3"
    server = "https://tiku.21cnjy.com/"
    save_dir = "/Users/lidongliang/Desktop/爬虫/data"
    subject_file = "1.txt"
    file_path = save_dir + '/' + subject_file
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.132 Safari/537.36',
        'Accept-Encoding': 'gzip',
    }
    categorys = get_category(main_url, server, headers)
    for category_url in categorys:
        counting = 0
        target_url = category_url
        # Follow "下一页" links, at most 100 list pages per category.
        while counting < 100:
            cheak_parsing_url = get_url(target_url, server, headers)
            get_html(cheak_parsing_url, file_path, headers)
            target_url = change_page(target_url, server, headers)
            if target_url is None:
                break
            counting += 1
```
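To check the cleaning step in isolation, the same logic can be run offline against a hard-coded snippet. The HTML below is fabricated; only the class names match the real pages:

```python
from bs4 import BeautifulSoup

# Fabricated example of a question with an embedded jammer element.
html = ('<div class="answer_detail">下列说法正确的是'
        '<span class="this_jammer">垃圾字符</span>A.选项一B.选项二</div>')
bf = BeautifulSoup(html, 'html.parser')
for tag in bf.find_all(class_=["this_jammer", "hidejammersa", "jammerd42"]):
    if tag.string:
        tag.string.replace_with("")
print(bf.find_all('div', class_='answer_detail')[0].text)
# -> 下列说法正确的是A.选项一B.选项二
```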

Preliminary run results

Regex-matching the text to extract the target fields and save them to the database

The code block is as follows:
```python
import re
import pymysql

# Delimiters that bracket each field inside one scraped line.
w1 = 'A.'
w2 = 'B.'
w3 = 'C.'
w4 = 'D.'
w5 = '答案'
w6 = '解析试题分析:'
w7 = '考点'


def get_txt():
    with open("/Users/lidongliang/Desktop/爬虫/data/1.txt", "r") as f:
        txt = f.readlines()
    return txt


def fen(txt):
    # Capture the text between consecutive delimiters: question stem,
    # options A-D, answer, analysis, knowledge point.
    timu = re.compile('^(.*?)' + re.escape(w1), re.S).findall(txt)
    A = re.compile(re.escape(w1) + '(.*?)' + re.escape(w2), re.S).findall(txt)
    B = re.compile(re.escape(w2) + '(.*?)' + re.escape(w3), re.S).findall(txt)
    C = re.compile(re.escape(w3) + '(.*?)' + re.escape(w4), re.S).findall(txt)
    D = re.compile(re.escape(w4) + '(.*?)' + re.escape(w5), re.S).findall(txt)
    daan = re.compile(re.escape(w5) + '(.*?)' + re.escape(w6), re.S).findall(txt)
    jiexi = re.compile(re.escape(w6) + '(.*?)' + re.escape(w7), re.S).findall(txt)
    kaodian = re.compile(re.escape(w7) + r'(.*?)\Z', re.S).findall(txt)

    timu.extend(A)
    timu.extend(B)
    timu.extend(C)
    timu.extend(D)
    timu.extend(daan)
    timu.extend(jiexi)
    timu.extend(kaodian)

    try:
        tg = timu[0]
        xx = "A:" + timu[1] + "B:" + timu[2] + "C:" + timu[3] + "D:" + timu[4]
        da = timu[5]
        fx = timu[6]
        kd = timu[7]
    except IndexError:
        # Lines that do not match the expected layout get placeholder values.
        tg = xx = da = fx = kd = '1'

    con = pymysql.connect(host='localhost', user='root', passwd='00000000',
                          db='login_test_1', charset='utf8')
    cursor = con.cursor()
    # Parameterised query: avoids quoting problems and SQL injection.
    sql = "insert into question_info(tg, xx, da, fx, kd) values(%s, %s, %s, %s, %s)"
    cursor.execute(sql, (tg, xx, da, fx, kd))
    con.commit()
    con.close()


if __name__ == "__main__":
    txt = get_txt()
    for i in txt:
        fen(i)
    print("done")
```
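To sanity-check the regular expressions without touching MySQL, the splitting step can be run on a single fabricated line (the sample text below is invented for illustration):

```python
import re

# Invented sample line in the same layout the scraper produces.
sample = ("下列说法正确的是A.选项一B.选项二C.选项三D.选项四"
          "答案B解析试题分析:略考点基础知识")
delims = ['A.', 'B.', 'C.', 'D.', '答案', '解析试题分析:', '考点']

fields = re.compile('^(.*?)' + re.escape(delims[0]), re.S).findall(sample)
for left, right in zip(delims, delims[1:]):
    fields += re.compile(re.escape(left) + '(.*?)' + re.escape(right),
                         re.S).findall(sample)
fields += re.compile(re.escape(delims[-1]) + r'(.*?)\Z', re.S).findall(sample)

print(fields)
# -> ['下列说法正确的是', '选项一', '选项二', '选项三', '选项四', 'B', '略', '基础知识']
```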
Final result:

