2022/6/16 23:23:31
"""Scrape the recent videos of one TikTok profile into ``tiktok.xlsx``.

Known issues:
1. Slow network: pages sometimes fail to load and have to be watched;
   scrolling down sometimes yields no data either.
2. Slider captchas.
"""
import datetime
import random
import re
import time

_HOUR = 3600
_DAY = 24 * _HOUR

# Labels that simply mean "right now".
_NOW_LABELS = {'今天更新', '刚刚', '剛剛'}

# Seconds per unit for relative times such as "3小时前" / "5分鐘前" / "2天前".
_UNIT_SECONDS = {
    '小时': _HOUR,
    '小時': _HOUR,
    '分钟': 60,
    '分鐘': 60,
    '天': _DAY,
}

# "<number><unit>..." at the start of the text.  A bare number with no unit
# also matches (unit group is None) because the original parser read such
# input as a number of hours.
_RELATIVE_RE = re.compile(r'(\d+)\s*(?:(小时|小時|分钟|分鐘|天)|$)')

# "HH:MM" clock time following "昨天".
_CLOCK_RE = re.compile(r'(\d{1,2}):(\d{1,2})')

# "[YYYY年]M月D[日][ HH:MM]" at the start of the text.
_DATE_RE = re.compile(
    r'(?:(\d{4})年)?(\d{1,2})月(\d{1,2})日?\s*(?:(\d{1,2}):(\d{1,2}))?'
)

PROFILE_URL = 'https://www.tiktok.com/@xiaoqiww'
OUTPUT_PATH = 'tiktok.xlsx'

# XPath of the video links on the profile page.
VIDEO_LINK_XPATH = (
    '//div[@class="tiktok-yz6ijl-DivWrapper e1cg0wnj1"]//a//@href'
)

# XPaths on a video detail page.  They are tied to the page layout and the
# generated CSS class names, so they break whenever the site changes them.
_DETAIL_ROOT = '/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/div'
NAME_XPATH = _DETAIL_ROOT + '/div[1]/div[1]/a[2]/h3//text()'
CONTENT_XPATH = _DETAIL_ROOT + '/div[1]/div[2]//text()'
LIKE_XPATH = _DETAIL_ROOT + '/div[2]/div[2]/button[1]/strong//text()'
COMMENT_XPATH = _DETAIL_ROOT + '/div[2]/div[2]/button[2]/strong//text()'
SHARE_XPATH = _DETAIL_ROOT + '/div[2]/div[2]/button[3]/strong//text()'
PUBLISH_TIME_XPATH = (
    '//div[@data-e2e="recommend-list-item-container"][1]'
    '//a[@class="tiktok-1lqhxf7-StyledAuthorAnchor emt6k1z1"]//text()'
)

# Spreadsheet columns, in output order.
COLUMNS = ('详情页链接', '姓名', '发布时间', '内容', '点赞', '评论', '转发')

# Abbreviated counters such as "1.2K" / "3M".
_COUNT_RE = re.compile(r'(\d+(?:\.\d+)?)\s*([KkMmBb]?)')
_COUNT_SCALE = {'': 1, 'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}


def time_turn(timenum):
    """Format a Unix timestamp as a local ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        timenum: The timestamp in seconds, as an int or as a string of
            1 to 10 decimal digits.

    Returns:
        The formatted local time, or ``None`` (after printing a hint) when
        the input is not a 1-10 digit decimal number.
    """
    digits = str(timenum)
    # isdecimal() rather than isdigit(): isdigit() accepts characters such
    # as superscript digits that int() cannot parse.
    if 0 < len(digits) < 11 and digits.isdecimal():
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(digits)))
    print('请输入11位以内的数字')
    return None


def today_start() -> int:
    """Return the Unix timestamp of today's local midnight."""
    return int(time.mktime(datetime.date.today().timetuple()))


def time_turns(time1) -> int:
    """Convert a Chinese publish-time label to a Unix timestamp.

    Recognised forms (simplified and traditional characters, with or
    without a "发布" suffix):

    * ``刚刚`` / ``今天更新``: now
    * ``昨天``: 24 hours ago; ``昨天 HH:MM``: that clock time yesterday
    * ``N小时前`` / ``N分钟前`` / ``N天前``: that long ago (a bare number is
      read as hours, as the original parser did)
    * ``M月D日[ HH:MM]``: that date in the current year
    * ``YYYY年M月D日[ HH:MM]``: that date, only if YYYY is the current year

    Args:
        time1: The label text; anything else is converted with ``str()``.

    Returns:
        Seconds since the epoch (local time zone), or ``0`` when the label
        belongs to another year or cannot be parsed.
    """
    text = str(time1)
    for suffix in ('发布', '發布', '發佈'):
        text = text.replace(suffix, '')
    text = text.strip()
    now = int(time.time())

    if text in _NOW_LABELS:
        return now

    if text.startswith('昨天'):
        clock = _CLOCK_RE.search(text)
        if clock is None:
            return now - _DAY
        hours, minutes = int(clock.group(1)), int(clock.group(2))
        return today_start() - _DAY + hours * _HOUR + minutes * 60

    relative = _RELATIVE_RE.match(text)
    if relative is not None:
        unit_seconds = _UNIT_SECONDS.get(relative.group(2), _HOUR)
        return now - int(relative.group(1)) * unit_seconds

    date = _DATE_RE.match(text)
    if date is not None:
        year_text, month, day, hour, minute = date.groups()
        this_year = datetime.date.today().year
        if year_text is not None and int(year_text) != this_year:
            print('不是今年的数据,不采集')
            return 0
        try:
            moment = datetime.datetime(
                this_year, int(month), int(day), int(hour or 0), int(minute or 0)
            )
        except ValueError:
            # Impossible date such as "13月40日": report it as unparseable.
            moment = None
        if moment is not None:
            return int(time.mktime(moment.timetuple()))

    print(f'无法解析的时间: {time1!r}')
    return 0


def _parse_count(text) -> int:
    """Turn a counter label ("123", "1.2K", "3M") into an int.

    A label that carries no number (for example the "分享" placeholder shown
    when nothing has been shared yet) counts as 0.
    """
    match = _COUNT_RE.fullmatch(str(text).strip().replace(',', ''))
    if match is None:
        return 0
    return int(round(float(match.group(1)) * _COUNT_SCALE[match.group(2).lower()]))


def _pick(tree, xpath, index=0, default=''):
    """Return one XPath hit from *tree*, or *default* when nothing matches."""
    hits = tree.xpath(xpath)
    return hits[index] if hits else default


def _scrape_video(tree, video_url):
    """Extract one spreadsheet row (a dict keyed by COLUMNS) from a video page."""
    stamp = time_turns(_pick(tree, PUBLISH_TIME_XPATH, index=-1))
    return {
        '详情页链接': video_url,
        '姓名': _pick(tree, NAME_XPATH),
        # An unknown time (stamp 0) is left blank instead of being written
        # out as 1970-01-01.
        '发布时间': time_turn(stamp) if stamp else '',
        '内容': _pick(tree, CONTENT_XPATH),
        '点赞': _parse_count(_pick(tree, LIKE_XPATH)),
        '评论': _parse_count(_pick(tree, COMMENT_XPATH)),
        '转发': _parse_count(_pick(tree, SHARE_XPATH)),
    }


def main():
    """Open the profile, visit each listed video and save the rows to Excel."""
    # Imported here so the time helpers above stay importable on machines
    # without the browser-automation stack.
    import pandas as pd
    from lxml import etree
    from selenium import webdriver

    rows = []
    driver = webdriver.Chrome()
    try:
        driver.get(url=PROFILE_URL)
        time.sleep(5)
        # Scroll twice so the profile lazily loads more videos.
        for page in range(1, 3):
            time.sleep(random.randint(3, 5))
            print(f'********************第{page}页******************')
            driver.execute_script('window.scrollBy(0,2200)')

        video_urls = etree.HTML(driver.page_source).xpath(VIDEO_LINK_XPATH)
        print(len(video_urls))

        for video_url in video_urls:
            time.sleep(5)
            print(video_url)
            driver.get(video_url)
            rows.append(_scrape_video(etree.HTML(driver.page_source), video_url))
    finally:
        # Always close the browser, even when a page fails mid-run.
        driver.quit()

    data = {column: [row[column] for row in rows] for column in COLUMNS}
    print(data)
    pd.DataFrame(data=data).to_excel(OUTPUT_PATH)
    print('保存成功')


if __name__ == '__main__':
    main()
- 2024-11-23Springboot应用的多环境打包入门
- 2024-11-23Springboot应用的生产发布入门教程
- 2024-11-23Python编程入门指南
- 2024-11-23Java创业入门:从零开始的编程之旅
- 2024-11-23Java创业入门:新手必读的Java编程与创业指南
- 2024-11-23Java对接阿里云智能语音服务入门详解
- 2024-11-23Java对接阿里云智能语音服务入门教程
- 2024-11-23JAVA对接阿里云智能语音服务入门教程
- 2024-11-23Java副业入门:初学者的简单教程
- 2024-11-23JAVA副业入门:初学者的实战指南