Douyin Crawler csv_v3.0

capture statistics of an author

  1. // ==UserScript==
  2. // @name Douyin Crawler csv_v3.0
  3. // @namespace http://tampermonkey.net/
  4. // @version v3.0_20241203
  5. // @description capture statistics of an author
  6. // @author qmcc
  7. // @match https://www.douyin.com/user/*
  8. // @icon https://www.google.com/s2/favicons?sz=64&domain=douyin.com
  9. // @grant none
  10. // @license MIT
  11. // ==/UserScript==
  12.  
  13. (function() {
  14. 'use strict';
  15.  
  // Author profile fields; filled in by parseAuthor().
  const author = {};
  // One record per post captured so far; appended to by parsePost().
  const posts = [];
  // Last known "more pages available" flag; starts true so auto-scroll begins.
  let hasMore = true;
  19.  
  /** Scroll the window to the current bottom of the document body. */
  function scrollToBottom() {
    const { scrollHeight } = document.body;
    window.scrollTo(0, scrollHeight);
  }
  23.  
  // Auto-scroll: every 100 ms, jump to the bottom of the page for as long as
  // `hasMore` is true (presumably so the page keeps requesting further posts).
  setInterval(() => {
    if (!hasMore) {
      return;
    }
    console.log('Scrolling to bottom');
    scrollToBottom();
  }, 100);
  31.  
  /**
   * Update the shared `hasMore` flag from a post-list response.
   *
   * @param {object} response - Parsed response body; only `has_more` is read.
   */
  function parseHasMore(response) {
    // Only the exact number 1 means "more"; any other value turns the flag off.
    hasMore = response.has_more === 1;
  }
  39.  
  /**
   * Extract the statistics of one post and append them to the shared `posts` list.
   *
   * @param {object} aweme - One entry of a response's `aweme_list`; reads `desc`,
   *   `create_time`, `duration` and the counters under `statistics`.
   */
  function parsePost(aweme) {
    const { statistics } = aweme;
    posts.push({
      // Post title
      title: aweme.desc,
      // Publication time (create_time is multiplied by 1000 — presumably seconds -> ms)
      createDatetime: new Date(aweme.create_time * 1000),
      // Likes
      likeCount: statistics.digg_count,
      // Shares
      shareCount: statistics.share_count,
      // Comments
      commentCount: statistics.comment_count,
      // Collections (saves)
      collectCount: statistics.collect_count,
      // Length (duration is divided by 1000 — presumably ms -> seconds)
      duration: aweme.duration / 1000,
    });
  }
  59.  
  60.  
  61.  
  /**
   * Copy the fields of interest from an intercepted profile response into the
   * shared `author` record.
   *
   * Optional fields that are absent no longer abort the parse: previously a
   * profile without `ip_location` threw on `.replace`, so every field after it
   * stayed unset.
   *
   * @param {object} profile - Parsed response body; all fields are read from
   *   `profile.user`. When that object is missing, a warning is logged and
   *   `author` is left untouched.
   */
  function parseAuthor(profile) {
    const user = profile && profile.user;
    if (!user) {
      console.warn('Douyin Crawler: profile response has no user object', profile);
      return;
    }

    // url: profile page the script is running on
    author.url = window.location.href;

    // nickname: display name
    author.nickname = user.nickname;
    // id: Douyin ID (falls back to short_id when unique_id is empty)
    author.id = user.unique_id || user.short_id;
    // favoratedCount: likes received
    author.favoratedCount = user.total_favorited;
    // followerCount: followers
    author.followerCount = user.follower_count;
    // followingCount: accounts followed
    author.followingCount = user.following_count;
    // favoritingCount: posts liked
    author.favoritingCount = user.favoriting_count;
    // gender
    // NOTE(review): assumes the API encodes 1 = male and 2 = female, with any
    // other value meaning "not disclosed" — confirm. Unknown values are left
    // blank instead of being reported as female.
    const genderLabels = { 1: '男', 2: '女' };
    author.gender = genderLabels[user.gender] || null;
    // age: only positive values are kept
    author.age = user.user_age > 0 ? user.user_age : null;
    // ipLocation: IP region with the label prefix removed (ASCII or full-width colon)
    author.ipLocation = typeof user.ip_location === 'string'
      ? user.ip_location.replace(/IP属地[:：]/, '')
      : null;
    // province
    author.province = user.province;
    // city
    author.city = user.city;
    // postCount: number of published posts
    author.postCount = user.aweme_count;
    // hasShop: has a shop showcase entry
    author.hasShop = user.with_fusion_shop_entry;
    // hasLiveCommerce: live-commerce flag
    author.hasLiveCommerce = user.live_commerce;
    // signature: profile bio
    author.signature = user.signature;
    // withCommerceEnterpriseTabEntry
    author.withCommerceEnterpriseTabEntry = user.with_commerce_enterprise_tab_entry;
    // withCommerceEntry
    author.withCommerceEntry = user.with_commerce_entry;
    // withNewGoods
    author.withNewGoods = user.with_new_goods;
    // youtubeChannelId
    author.youtubeChannelId = user.youtube_channel_id;
    // youtubeChannelTitle
    author.youtubeChannelTitle = user.youtube_channel_title;
    // showFavoriteList: whether the liked list is shown
    author.showFavoriteList = user.show_favorite_list;
    // showSubscription: whether the following list is shown
    author.showSubscription = user.show_subscription;
    // isActivityUser
    author.isActivityUser = user.is_activity_user;
    // isBan: banned flag
    author.isBan = user.is_ban;
    // isBlock: block flag
    author.isBlock = user.is_block;
    // isBlocked
    author.isBlocked = user.is_blocked;
    // isEffectArtist
    author.isEffectArtist = user.is_effect_artist;
    // isGovMediaVip
    author.isGovMediaVip = user.is_gov_media_vip;
    // isMixUser
    author.isMixUser = user.is_mix_user;
    // isNotShow
    author.isNotShow = user.is_not_show;
    // isSeriesUser
    author.isSeriesUser = user.is_series_user;
    // isSharingProfileUser
    author.isSharingProfileUser = user.is_sharing_profile_user;
    // isStar
    author.isStar = user.is_star;
    // isoCountryCode
    author.isoCountryCode = user.iso_country_code;
    // customVerify: custom verification text
    author.customVerify = user.custom_verify;
    // hasMcn: true when account_info_url contains 'mcn'
    author.hasMcn = (user.account_info_url && user.account_info_url.includes('mcn')) || false;

    // groupChatCount: number parsed from a card entry whose sub_title mentions group chats
    author.groupChatCount = 0;
    if (Array.isArray(user.card_entries)) {
      const groupChatEntry = user.card_entries.find(
        (entry) => entry && typeof entry.sub_title === 'string' && entry.sub_title.includes('群聊')
      );
      const match = groupChatEntry ? groupChatEntry.sub_title.match(/(\d+)个群聊/) : null;
      if (match) {
        author.groupChatCount = parseInt(match[1], 10);
      }
    }
  }
  151.  
  // Ordered list of { header, content } cells making up the export row being built.
  let dataFrame = [];

  /** Discard all collected cells by pointing `dataFrame` at a fresh empty array. */
  function clearDataFrame() {
    dataFrame = [];
  }
  157.  
  /**
   * Append one cell to the export row.
   *
   * String content is flattened onto a single line so it cannot break the
   * tab-separated layout: every tab, line feed, carriage return and CRLF pair
   * becomes one space. Non-string content is stored unchanged.
   *
   * @param {string} header - Column title.
   * @param {*} content - Cell value.
   */
  function addEntryToDataFrame(header, content) {
    const cell = typeof content === 'string'
      // CRLF is matched first so a Windows line break yields one space, not two.
      ? content.replace(/\r\n|[\r\n\t]/g, ' ')
      : content;
    dataFrame.push({ header, content: cell });
  }
  166.  
  /**
   * Append every author column to the export row, in a fixed order.
   * Each pair below is [column header, property of the shared `author` record].
   */
  function addAuthorToDataFrame() {
    const authorColumns = [
      ['用户主页', 'url'],
      ['ID (抖音号)', 'id'],
      ['Nickname (昵称)', 'nickname'],
      ['Favorated (获赞数)', 'favoratedCount'],
      ['Follower (粉丝数)', 'followerCount'],
      ['Following (关注数)', 'followingCount'],
      ['Favoriting (喜欢数)', 'favoritingCount'],
      ['Gender (性别)', 'gender'],
      ['Age (年龄)', 'age'],
      ['IP Location (IP属地)', 'ipLocation'],
      ['Province (省份)', 'province'],
      ['City (城市)', 'city'],
      ['Post Count (发布视频数)', 'postCount'],
      ['Has Shop (是否有橱窗)', 'hasShop'],
      ['Has Live Commerce (是否有直播带货)', 'hasLiveCommerce'],
      ['Signature (个性签名)', 'signature'],
      ['With Commerce Enterprise Tab Entry', 'withCommerceEnterpriseTabEntry'],
      ['With Commerce Entry', 'withCommerceEntry'],
      ['With New Goods', 'withNewGoods'],
      ['Youtube Channel ID', 'youtubeChannelId'],
      ['Youtube Channel Title', 'youtubeChannelTitle'],
      ['Show Favorite List (是否展示喜欢列表)', 'showFavoriteList'],
      ['Show Subscription (是否展示关注列表)', 'showSubscription'],
      ['Is Activity User (是否活跃用户)', 'isActivityUser'],
      ['Is Ban (是否被封禁)', 'isBan'],
      ['Is Block (是否被拉黑)', 'isBlock'],
      ['Is Blocked', 'isBlocked'],
      ['Is Effect Artist (是否是特效艺术家)', 'isEffectArtist'],
      ['Is Gov Media VIP (是否是政府媒体VIP)', 'isGovMediaVip'],
      ['Is Mix User (是否是混合用户)', 'isMixUser'],
      ['Is Not Show (是否不展示)', 'isNotShow'],
      ['Is Series User (是否是系列用户)', 'isSeriesUser'],
      ['Is Sharing Profile User (是否是分享资料用户)', 'isSharingProfileUser'],
      ['Is Star (是否是明星)', 'isStar'],
      ['ISO Country Code (国家代码)', 'isoCountryCode'],
      ['Custom Verify (自定义认证)', 'customVerify'],
      ['Has MCN (是否有MCN机构)', 'hasMcn'],
      ['Group Chat Count (群聊数量)', 'groupChatCount'],
    ];

    for (const [header, field] of authorColumns) {
      addEntryToDataFrame(header, author[field]);
    }
  }
  207.  
  /**
   * Append the five columns describing one post to the export row.
   *
   * @param {string} title - Prefix for the column headers (e.g. a rank label).
   * @param {object} post - Post record; reads createDatetime and the four counters.
   */
  function addPostToDataFrame(title, post) {
    const columns = [
      // Publication date, formatted with the browser's locale
      ['Datetime (发布日期)', post.createDatetime.toLocaleDateString()],
      // Likes
      ['Like (点赞数)', post.likeCount],
      // Shares
      ['Share (转发数)', post.shareCount],
      // Comments
      ['Comment (评论数)', post.commentCount],
      // Collections (saves)
      ['Collect (收藏数)', post.collectCount],
    ];
    columns.forEach(([suffix, value]) => {
      addEntryToDataFrame(`${title}-${suffix}`, value);
    });
  }
  215.  
  /**
   * Rebuild the export row from the collected author and post data.
   *
   * The row contains, in order: the author columns, duration statistics, the
   * number of posts from the last seven days, then up to three most-liked,
   * three newest and three oldest posts.
   *
   * The shared `posts` array is never reordered: each ranking sorts a copy.
   * With no posts captured, the duration statistics are null (blank cells)
   * instead of NaN/undefined.
   *
   * @returns {Array[]} `[headers, content]` — two parallel arrays.
   */
  function getStatistics() {
    clearDataFrame();

    // Author columns
    addAuthorToDataFrame();

    const postCount = posts.length;
    // Ascending durations; the median and both percentile lookups index into this.
    const durations = posts.map((post) => post.duration).sort((a, b) => a - b);

    // Mean duration
    const averageDuration = postCount > 0
      ? posts.reduce((acc, post) => acc + post.duration, 0) / postCount
      : null;
    addEntryToDataFrame('Average Duration (平均时长)', averageDuration);

    // Median duration: middle element, or the mean of the two middle elements
    // when the count is even.
    let medianDuration = null;
    if (postCount > 0) {
      const middle = Math.floor(postCount / 2);
      medianDuration = postCount % 2 === 1
        ? durations[middle]
        : (durations[middle - 1] + durations[middle]) / 2;
    }
    addEntryToDataFrame('Median Duration (中位数时长)', medianDuration);

    // Duration at the 90% position of the ascending list (threshold of the longest 10%)
    const percentile10Duration = postCount > 0 ? durations[Math.floor(postCount * 0.9)] : null;
    addEntryToDataFrame('10% Duration (前10%时长)', percentile10Duration);

    // Duration at the 10% position of the ascending list (threshold of the shortest 10%)
    const percentile90Duration = postCount > 0 ? durations[Math.floor(postCount * 0.1)] : null;
    addEntryToDataFrame('90% Duration (后10%时长)', percentile90Duration);

    // Number of posts published within the last seven days
    const oneWeekAgo = new Date();
    oneWeekAgo.setDate(oneWeekAgo.getDate() - 7);
    const lastWeekPostsCount = posts.filter((post) => post.createDatetime > oneWeekAgo).length;
    addEntryToDataFrame('Last Week Posts Count (最近一周发布数)', lastWeekPostsCount);

    // Sort a copy so the shared array keeps its capture order.
    const topThree = (compare) => [...posts].sort(compare).slice(0, 3);

    // Top 3 posts by likes
    topThree((a, b) => b.likeCount - a.likeCount).forEach((post, index) => {
      addPostToDataFrame(`Hot${index + 1}`, post);
    });

    // 3 newest posts
    topThree((a, b) => b.createDatetime - a.createDatetime).forEach((post, index) => {
      addPostToDataFrame(`Latest${index + 1}`, post);
    });

    // 3 oldest posts
    topThree((a, b) => a.createDatetime - b.createDatetime).forEach((post, index) => {
      addPostToDataFrame(`Oldest${index + 1}`, post);
    });

    // Column titles
    const headers = dataFrame.map((entry) => entry.header);

    // Cell values
    const content = dataFrame.map((entry) => entry.content);

    return [headers, content];
  }
  272.  
  /**
   * Copy the column headers to the clipboard as one tab-separated line
   * (pastes into a spreadsheet as one row).
   *
   * The confirmation alert is shown only after the clipboard write resolves;
   * a rejected write is logged and reported instead of being announced as a
   * success.
   */
  function writeHeadersToClipboard() {
    const [headers] = getStatistics();
    const text = headers.join('\t');
    navigator.clipboard.writeText(text).then(
      () => {
        alert('表头已复制到剪贴板');
      },
      (error) => {
        console.error('Douyin Crawler: failed to copy headers to clipboard', error);
        alert('表头复制失败');
      }
    );
  }
  280.  
  /**
   * Copy the data row to the clipboard as one tab-separated line
   * (pastes into a spreadsheet as one row).
   *
   * The confirmation alert is shown only after the clipboard write resolves and
   * tells the user whether more posts are still expected (`hasMore`); a rejected
   * write is logged and reported instead of being announced as a success.
   */
  function writeContentToClipboard() {
    const [, content] = getStatistics();
    const text = content.join('\t');
    navigator.clipboard.writeText(text).then(
      () => {
        const message = hasMore
          ? '内容已复制到剪贴板,还有更多数据,请继续滚动页面'
          : '内容已复制到剪贴板,数据已全部加载完毕';
        alert(message);
      },
      (error) => {
        console.error('Douyin Crawler: failed to copy content to clipboard', error);
        alert('内容复制失败');
      }
    );
  }
  292.  
  // Startup marker: shows in the console that the userscript body has run.
  console.log('Douyin Crawler is running');
  294.  
  /**
   * Find the first <div> in document order whose innerText equals `text` exactly.
   *
   * @param {string} text - Text to match.
   * @returns {Element|undefined} The matching div, or undefined when none matches.
   */
  function findDivByInnerText(text) {
    for (const candidate of document.querySelectorAll('div')) {
      if (candidate.innerText === text) {
        return candidate;
      }
    }
    return undefined;
  }
  298.  
  // Once per second, look for the page's "feedback" and "FAQ" divs and swap each
  // for a clone that triggers one of the copy actions. The clone is appended to
  // the end of the original's parent and the original is then removed.
  setInterval(() => {
    const replacements = [
      // [text of the div to replace, new label, click handler]
      ['意见反馈', '复制内容', writeContentToClipboard],
      ['常见问题', '复制表头', writeHeadersToClipboard],
    ];

    replacements.forEach(([originalText, newText, handler]) => {
      const original = findDivByInnerText(originalText);
      if (!original) {
        return;
      }
      const replacement = original.cloneNode(true);
      replacement.innerText = newText;
      replacement.onclick = handler;
      original.parentNode.appendChild(replacement);
      original.remove();
    });
  }, 1000);
  317.  
  /**
   * Serialise one data row as a CSV line terminated by a newline.
   *
   * Cells that contain a comma, a double quote or a line break are wrapped in
   * double quotes with embedded quotes doubled, so free text such as titles
   * or signatures cannot shift columns. null/undefined become empty cells.
   *
   * @param {string[]} headers - Accepted for call compatibility but not written:
   *   the output holds the data row only, as before.
   *   NOTE(review): confirm whether a header line should also be emitted.
   * @param {Array} content - Cell values of the row.
   * @returns {string} The CSV text.
   */
  function convertToCSV(headers, content) {
    const toCell = (value) => {
      const text = value === null || value === undefined ? '' : String(value);
      return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
    };
    return `${content.map(toCell).join(',')}\n`;
  }
  323.  
  /**
   * Offer `csvContent` to the user as a file download.
   *
   * A temporary <a download> element pointing at a Blob URL is clicked and then
   * removed. The text is prefixed with a UTF-8 byte-order mark (unless it
   * already starts with one) so spreadsheet applications detect the encoding
   * of the non-ASCII headers and values. The Blob URL is revoked shortly
   * afterwards so the Blob can be released.
   *
   * @param {string} filename - Suggested name of the downloaded file.
   * @param {string} csvContent - CSV text to save.
   */
  function downloadCSV(filename, csvContent) {
    const BOM = '\uFEFF';
    const text = String(csvContent);
    const parts = text.startsWith(BOM) ? [text] : [BOM, text];
    const blob = new Blob(parts, { type: 'text/csv;charset=utf-8;' });
    const url = URL.createObjectURL(blob);

    const link = document.createElement('a');
    link.href = url;
    link.download = filename;
    document.body.appendChild(link);
    try {
      link.click();
    } finally {
      document.body.removeChild(link);
      // Revoke after a short delay so the download that was just started is not cut off.
      setTimeout(() => URL.revokeObjectURL(url), 1000);
    }
  }
  335.  
  /**
   * Click handler of the export button: build the statistics, convert them to
   * CSV, start the download as 'douyin_data.csv' and confirm with an alert.
   */
  function exportDataToCSV() {
    const statistics = getStatistics();
    const csvText = convertToCSV(statistics[0], statistics[1]);
    downloadCSV('douyin_data.csv', csvText);
    alert('CSV 文件已生成并下载!');
  }
  342.  
  /**
   * Add a fixed-position button to the page that triggers the CSV export.
   * The button sits near the bottom-right corner (270px from the bottom,
   * 20px from the right) and calls exportDataToCSV when clicked.
   */
  function createExportButton() {
    const exportButton = document.createElement('button');
    exportButton.textContent = '导出为 CSV';
    Object.assign(exportButton.style, {
      position: 'fixed',
      bottom: '270px',
      right: '20px',
      padding: '10px',
      backgroundColor: '#ff5722',
      color: '#fff',
      border: 'none',
      borderRadius: '5px',
      cursor: 'pointer',
    });
    exportButton.addEventListener('click', exportDataToCSV);
    document.body.appendChild(exportButton);
  }
  359.  
  // Add the export button as soon as the script body runs.
  createExportButton();
  362.  
  363.  
  364.  
  365.  
  // Hijack XMLHttpRequest: wrap open() so that finished responses of requests
  // whose URL contains '/post' or '/profile/other' can be read by this script.
  const originalOpen = XMLHttpRequest.prototype.open;

  /**
   * Read the body of a finished request as parsed JSON.
   *
   * @param {XMLHttpRequest} xhr - Request whose readyState is 4.
   * @returns {*} The parsed body, or null when it cannot be read or parsed.
   */
  function readJsonResponse(xhr) {
    try {
      // responseText is only readable for responseType '' or 'text'; a 'json'
      // request already exposes the parsed body on `response`.
      if (xhr.responseType === 'json') {
        return xhr.response;
      }
      return JSON.parse(xhr.responseText);
    } catch (error) {
      console.warn('Douyin Crawler: could not read response as JSON', error);
      return null;
    }
  }

  XMLHttpRequest.prototype.open = function (...args) {
    // The URL argument may be a URL object rather than a string.
    const requestUrl = String(args[1]);
    const isPostList = requestUrl.includes('/post');
    const isProfile = !isPostList && requestUrl.includes('/profile/other');

    if (isPostList || isProfile) {
      this.addEventListener('readystatechange', function () {
        if (this.readyState !== 4) {
          return;
        }
        const response = readJsonResponse(this);
        if (!response) {
          return;
        }
        if (isPostList) {
          parseHasMore(response);
          const awemeList = Array.isArray(response.aweme_list) ? response.aweme_list : [];
          awemeList.forEach((aweme) => {
            parsePost(aweme);
          });
          console.log('Posts:', posts);
        } else {
          parseAuthor(response);
          console.log('Author:', author);
        }
      }, false);
    }

    // Forward exactly the arguments the caller supplied. Passing an explicit
    // `undefined` for the third (async) argument would select the long form of
    // open() with async = false and turn the request synchronous.
    return originalOpen.apply(this, args);
  };
  386.  
  387.  
  388. })();

QingJ © 2025

镜像随时可能失效,请加Q群300939539或关注我们的公众号极客氢云获取最新地址