PHP
·
发表于 6年以前
·
阅读量:8498
'''
针对贴吧前5页(可改)实现功能:
1、保存所查询的网页内容到文件
2、摘取每个帖子的属性信息(标题,发帖人,发帖时间,评论数),并保存到数据库中
3、根据标题从数据库中搜索帖子
'''
from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import sqlite3
import os
# Ask the user which forum keyword to crawl.
key = input('请输入一个查询关键字')
# key = '芙蓉'  (fixed keyword, handy while debugging)

# Query-string parameters sent with every listing-page request.
args = {'kw': key, 'ie': 'utf-8'}

# Base listing URL; get_one_page() appends the '&pn=' paging offset to it.
url1 = ''.join(['http://tieba.baidu.com/f?', urlencode(args)])
def get_one_page(index):
    """Download one listing page and return its HTML as text.

    Args:
        index: Zero-based page number. The request's ``pn`` parameter is
            set to ``index * 50`` (presumably 50 threads per page).

    Returns:
        The response body decoded as UTF-8.
    """
    url = url1 + '&pn={}'.format(index * 50)
    # Use the response as a context manager so the connection is always
    # closed, even if read() or decode() raises.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def save_one_page(index, html):
    """Write one page of HTML to ``tieba/tieba_<index + 1>.html``.

    Args:
        index: Zero-based page number; file names are numbered from 1.
        html: Page content to write; it is encoded as UTF-8.
    """
    # open() does not create missing directories, so make sure the output
    # folder exists before writing into it.
    os.makedirs('tieba', exist_ok=True)
    # Build the path with os.path.join rather than a literal backslash so
    # the file lands inside the folder on every platform.
    filename = os.path.join('tieba', 'tieba_{}.html'.format(index + 1))
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html)
# SQLite database file used by create_table(), save() and find_by_title().
db_file = 'tieba.db'


def create_table():
    """Create the ``tieba`` table in ``db_file`` unless it already exists.

    Columns: ``id`` (autoincrement primary key), ``title``, ``author`` and
    ``time`` (text), and ``num`` (int).
    """
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        # "if not exists" makes the call safe to repeat, e.g. when the
        # database file is present but the table was never created.
        cursor.execute('''
            create table if not exists tieba(
                id integer primary key autoincrement,
                title text,
                author text,
                time text,
                num int
            )
        ''')
        conn.commit()
    finally:
        # Always release the connection, even if the statement fails.
        conn.close()
def save(tieba):
    """Insert one thread record into the ``tieba`` table of ``db_file``.

    Args:
        tieba: Dict-like object read with ``.get()`` for the keys
            ``title``, ``author``, ``time`` and ``num``; a missing key is
            stored as NULL.
    """
    row = (
        tieba.get('title'),
        tieba.get('author'),
        tieba.get('time'),
        tieba.get('num'),
    )
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        # Placeholders keep scraped text from being interpreted as SQL.
        cursor.execute('''
            insert into tieba
            (title,author,time,num)
            values
            (?, ?, ?, ?)
        ''', row)
        # Writes are only persisted once committed.
        conn.commit()
    finally:
        # Close the connection even when the insert fails.
        conn.close()
# Look up stored threads by a keyword in their title.
def find_by_title(key):
    """Return every stored thread whose title contains *key*.

    Args:
        key: Text to search for. It is wrapped in ``%`` and matched with
            SQL ``LIKE``, so ``%`` and ``_`` inside it act as wildcards.

    Returns:
        A list of dicts, one per matching row, with the keys ``id``,
        ``title``, ``author``, ``time`` and ``num``. Each dict also carries
        the author under the legacy misspelled key ``auther`` so existing
        callers keep working.
    """
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        # A read-only query needs no commit.
        result = cursor.execute('''
            select * from tieba
            where title like ?
        ''', ('%' + key + '%',))
        ls = []
        for row in result:
            record = {
                'id': row[0],
                'title': row[1],
                # 'author' matches the key used when records are saved;
                # 'auther' is the old spelling, kept for compatibility.
                'author': row[2],
                'auther': row[2],
                'time': row[3],
                'num': row[4],
            }
            ls.append(record)
    finally:
        # Close the connection even when the query fails.
        conn.close()
    return ls
def get_tieba_info(html):
    """Extract thread records from the HTML of one listing page.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of dicts, one for each ``#thread_list li`` element that
        contains a title link (``a.j_th_tit``). Each dict has the keys
        ``title``, ``author``, ``time`` and ``num``; a field whose element
        or attribute is absent from the page is ``None``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    ls_con = soup.select('#thread_list li')
    print(len(ls_con))
    ls = []  # collected thread records
    for con in ls_con:
        a = con.find('a', attrs={"class": 'j_th_tit'})
        if a is None:
            # No title link in this list item, so it is not a thread entry.
            continue

        # Every lookup below may come back empty; fall back to None rather
        # than raising AttributeError and losing the whole page.
        author_tag = con.find('span', attrs={'class': 'tb_icon_author'})
        author = author_tag.get('title') if author_tag is not None else None
        if author is not None:
            author = author.replace('\n', '')  # drop newline characters
            author = author.replace('主题作者:', '')  # drop the label prefix

        time_tag = con.find('span', attrs={'class': 'pull-right'})
        num_tag = con.find('span', attrs={'class': 'threadlist_rep_num'})

        ls.append({
            'title': a.get('title'),
            'author': author,
            'time': time_tag.get_text() if time_tag is not None else None,
            'num': num_tag.get_text() if num_tag is not None else None,
        })
    return ls
if __name__ == '__main__':
    # Create the table only while the database file does not exist yet.
    if not os.path.exists(db_file):
        create_table()

    # An earlier single-page variant (kept for reference, not executed):
    #   html = get_one_page(0)
    #   get_tieba_info(html)
    #   tieba_list += get_tieba_info(html)
    #   print(tieba_list)

    # Fetch pages 0-4, collecting the thread records and keeping a copy of
    # each page on disk.
    tieba_list = []
    for index in range(5):
        html = get_one_page(index)
        tieba_list.extend(get_tieba_info(html))
        save_one_page(index, html)

    # Store every collected record in the database.
    for t in tieba_list:
        save(t)

    # Let the user search the stored threads by title keyword.
    key = input('请输入一个关键词')
    for t in find_by_title(key):
        print(t)