搜索贴吧内容，摘取其标题、发帖人、发帖时间、评论数，并保存到数据库

5年以前 | 阅读数：869 次 | 编程语言：Python

'''
针对贴吧前5页（可改）实现功能：
1、保存所查询的网页内容到文件
2、摘取每个帖子的属性信息（标题，发帖人，发帖时间，评论数），并保存到数据库中
3.根据标题从数据库中搜索帖子
'''
from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import sqlite3
import os

key = input('请输入一个查询关键字')
#key = '芙蓉'
args = {
    'kw': key,
    'ie': 'utf-8'
}
url1 = 'http://tieba.baidu.com/f?' + urlencode(args)

def get_one_page(index):
    url = url1 + '&pn={}'.format(index * 50)
    response = urlopen(url)
    return response.read().decode()

def save_one_page(index, html):
    filename = 'tieba\\tieba_{}.html'.format(index + 1)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html)
    pass

db_file = 'tieba.db'
def create_table():
    conn = sqlite3.connect(db_file)#1连接数据库
    cursor = conn.cursor()#2创建执行对象
    cursor.execute('''
       create table tieba(
          id integer primary  key  autoincrement ,
          title text,
          author  text,
          time text,
          num int
       )
    ''')#3
    conn.commit()#4.提交操作,对于可以修改数据库内容的语句必须提交
    conn.close()#5.关闭连接

def save(tieba):
    #连接
    conn = sqlite3.connect(db_file)
    #创建执行对象
    cursor = conn.cursor()
    #执行SQL语句
    cursor.execute('''
        insert into tieba
        (title,author,time,num)
        values
        (?, ?, ?, ?)
    ''',(tieba.get('title'),tieba.get('author'),tieba.get('time'),
         tieba.get('num')))
    #提交
    conn.commit()
    #关闭
    conn.close()


# 根据标题关键字查询数据库
def find_by_title(key):
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()
    result = cursor.execute('''
       select * from tieba
       where title like ?
    ''', ('%'+key+'%',))
    #查询不需要提交
    ls = []
    for row in result:
        movie = {}
        movie['id'] = row[0]
        movie['title'] = row[1]
        movie['auther'] = row[2]
        movie['time'] = row[3]
        movie['num'] = row[4]

        ls.append(movie)

    conn.close()
    return  ls

def get_tieba_info(html):

    soup = BeautifulSoup(html, 'html.parser')

    ls_con = soup.select('#thread_list li')
    print(len(ls_con))

    ls = []#定义一个空列表，用来存放贴吧的信息

    for con in ls_con:
        tieba = {}
        a = con.find('a', attrs={"class": 'j_th_tit'})
        # print(a)
        if a == None:
            continue
        else:
            title = a.get('title')
            # print(title)
            tieba['title'] = title

        p = con.find('span', attrs={'class': 'tb_icon_author'})
        author = p.get('title')
        # print(author)
        author = author.replace('\n', '')  # 去掉字符串结尾的\n
        author = author.replace('主题作者:', '')  # 去掉字符串中的多余字符
        # print(author)
        tieba['author'] = author

        p = con.find('span', attrs={'class': 'pull-right'})
        # print(p)
        time = p.get_text()
        # print(time)
        tieba['time'] = time

        p = con.find('span', attrs={'class': 'threadlist_rep_num'})
        # print(p)
        num = p.get_text()
        # print(num)
        tieba['num'] = num

        ls.append(tieba)
    return ls


if __name__ == '__main__':
    if not os.path.exists(db_file):#若已经存在就不再创建新表
       create_table()
    tieba_list = []
    '''
    #仅保存第一页的代码
    html = get_one_page(0)
    get_tieba_info(html)
    tieba_list += get_tieba_info(html)
    print(tieba_list)
    '''

   #保存到数据库，最后要测试
    for index in range(0, 5):
        html = get_one_page(index)
        tieba_list += get_tieba_info(html)
        # 保存网页到文件
        save_one_page(index, html)

    #把数据保存到表中
    for t in tieba_list:
           save(t)

    key = input('请输入一个关键词')
    ls = find_by_title(key)
    for t in ls:
        print(t)