PHP
·
发表于 6年以前
·
阅读量:8496
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import sqlite3
import os
def get_one_page(index):
    """Download one page of the Maoyan board and return its HTML text.

    Args:
        index: Page number; it is multiplied by 10 to form the ``offset``
            query parameter of the request URL.

    Returns:
        The response body decoded as UTF-8.
    """
    url = 'https://maoyan.com/board/4?offset=%d' % (index * 10)
    # The context manager closes the HTTP response even when reading or
    # decoding raises.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
# Path of the SQLite database file that the database helpers in this file
# pass to sqlite3.connect().
db_file = "maoyan.db"
def create_table():
    """Create the ``movie`` table in the database at ``db_file``.

    The statement uses ``if not exists``, so calling this function on a
    database that already contains the table does nothing instead of raising.
    """
    conn = sqlite3.connect(db_file)
    try:
        conn.execute('''
            create table if not exists movie(
                id integer primary key autoincrement,
                title text,
                star text,
                reltime text,
                country text,
                score float
            )
        ''')
        # Commit before the connection is closed.
        conn.commit()
    finally:
        # Release the connection even when the statement fails.
        conn.close()
def save(movie):
    """Insert one movie into the ``movie`` table of the database at ``db_file``.

    Args:
        movie: Mapping read with ``.get()`` for the keys 'title', 'star',
            'time', 'country' and 'score'; a missing key is stored as NULL.
            The 'time' value goes into the ``reltime`` column.
    """
    values = (
        movie.get('title'),
        movie.get('star'),
        movie.get('time'),
        movie.get('country'),
        movie.get('score'),
    )
    conn = sqlite3.connect(db_file)
    try:
        # Placeholders keep the values out of the SQL text.
        conn.execute('''
            insert into movie
            (title, star, reltime, country, score)
            values
            (?, ?, ?, ?, ?)
        ''', values)
        conn.commit()
    finally:
        # Release the connection even when the insert fails.
        conn.close()
def find_by_title(key):
    """Query the database for movies whose title contains a keyword.

    Args:
        key: Text to look for. It is wrapped in ``%`` and compared with SQL
            ``like``, so ``%`` and ``_`` inside it act as wildcards.

    Returns:
        A list with one dict per matching row, with the keys 'id', 'title',
        'star', 'time', 'country' and 'score' taken from the row's columns in
        that order (the ``reltime`` column is returned under 'time').
    """
    keys = ('id', 'title', 'star', 'time', 'country', 'score')
    conn = sqlite3.connect(db_file)
    try:
        rows = conn.execute('''
            select * from movie
            where title like ?
        ''', ('%' + key + '%',))
        # A query changes nothing, so no commit is needed.
        return [dict(zip(keys, row)) for row in rows]
    finally:
        # Release the connection even when the query fails.
        conn.close()
if __name__ == '__main__':
    # Create the table only while the database file does not exist yet.
    if not os.path.exists(db_file):
        create_table()
    # Example of saving a single record:
    # movie = {'title': '霸王别姬', 'star': '张国荣,张丰毅,巩俐', 'time': '1993-01-01', 'score': 9.6}
    # save(movie)
    print(find_by_title('王'))
'''
在项目下会出现一个maoyan.db文件
1) PyCharm 右侧点击Database
2) 点击 + 弹出菜单中, 选择Data Source, 再选择sqlite
3) 如果有 Download missing driver files, 点击 Download
4) 选择 file 选择需要打开的数据库文件
5) 检查 maoyan.db 能不能展开
'''
'''
解析猫眼电影榜单页面,保存到数据库中'''
#html = get_one_page(0)
# Patterns applied to the text of the "star" and "releasetime" paragraphs.
# Raw strings, so "\s" and "\(" reach the regex engine exactly as written.
_STAR_RE = re.compile(r'^.*?:(.*?)\s')
_RELEASE_DATE_RE = re.compile(r'^.*?:([0-9-]+).*')
_COUNTRY_RE = re.compile(r'^.*?\((.*)\)')


def _first_group(pattern, text):
    """Return group 1 of ``pattern`` matched at the start of ``text``, or None."""
    found = pattern.match(text)
    return found.group(1) if found else None


def get_movie_info(html):
    """Extract the movies listed in one board page.

    Args:
        html: HTML text of a board page, e.g. the value returned by
            get_one_page().

    Returns:
        A list with one dict per ``.board-wrapper dd`` element. Each dict has
        the keys 'title', 'star', 'time' and 'score'; 'country' is present
        only when the release text contains a parenthesised part. 'star' and
        'time' are None when their text does not match the expected pattern.
    """
    soup = BeautifulSoup(html, 'html.parser')
    movies = []
    for dd in soup.select('.board-wrapper dd'):
        movie = {}

        link = dd.find('a', attrs={'class': 'image-link'})
        movie['title'] = link.get('title')

        # Drop every line break from the paragraph text before matching.
        star_text = dd.find('p', attrs={'class': 'star'}).get_text()
        star_text = star_text.replace('\n', '')
        movie['star'] = _first_group(_STAR_RE, star_text)

        release_text = dd.find('p', attrs={'class': 'releasetime'}).get_text()
        movie['time'] = _first_group(_RELEASE_DATE_RE, release_text)
        # The country is optional: the key is left out when there are no
        # parentheses in the release text.
        country = _first_group(_COUNTRY_RE, release_text)
        if country is not None:
            movie['country'] = country

        # The score is split over two <i> elements; join their texts before
        # converting to float.
        score_p = dd.find('p', attrs={'class': 'score'})
        integer_part = score_p.find('i', attrs={'class': 'integer'}).get_text()
        fraction_part = score_p.find('i', attrs={'class': 'fraction'}).get_text()
        movie['score'] = float(integer_part + fraction_part)

        movies.append(movie)
    return movies
if __name__ == '__main__':
    # Download and parse the ten board pages.
    movie_list = []
    for index in range(10):
        html = get_one_page(index)
        movie_list += get_movie_info(html)
    print(movie_list)
    # Store the scraped movies in the database.
    print(len(movie_list))
    for movie in movie_list:
        save(movie)
    # Look up the stored movies by a keyword typed by the user.
    key = input('请输入一个关键词')
    for movie in find_by_title(key):
        print(movie)