MagicScraper

Scrapes and displays data from the web page based on rules.

当前为 2023-07-20 提交的版本,查看 最新版本

此脚本不应直接安装。它是供其他脚本使用的外部库,要使用该库请加入元指令 // @require https://update.gf.qytechs.cn/scripts/471264/1222840/MagicScraper.js

  1. // ==UserScript==
  2. // @name MagicScraper
  3. // @namespace http://tampermonkey.net/
  4. // @version 0.1
  5. // @description Scrapes and displays data from the web page based on rules.
  6. // @author aolko
  7. // @match *://*/*
  8. // @grant GM_addStyle
  9. // ==/UserScript==
  10.  
  /**
   * Scrapes the current page according to a rule set and optionally replaces
   * the page body with a DOM rendered from the scraped data.
   *
   * Rule set shape (as consumed by the code below):
   *   {
   *     "<domain pattern>": {
   *       "<key>": "<css selector>",                 // leaf rule
   *       "<container selector>": { ...nested rules }, // nested rule
   *       "pages": { "<path or path pattern>": { ...rules } }
   *     }
   *   }
   * Patterns are matched case-insensitively against the whole hostname/path;
   * `*` matches any run of characters and `.` is literal.
   *
   * @param {Object|string} rules - Rule set object, or a URL string. A URL is
   *   fetched and expected to return JSON of the form `{ "rules": { ... } }`.
   * @param {Object} [options]
   * @param {boolean} [options.keepChildren] - Forwarded to the DOM builder; see
   *   `createDOMFromScrapedData`.
   * @param {boolean} [options.replaceBody] - When truthy, `document.body` is
   *   emptied and filled with the rendered data.
   * @returns {Object} The scraped data. Leaf rules yield arrays of
   *   `{ text, html }`; nested rules yield arrays with one object per matched
   *   container. When `rules` is a URL the object is returned empty and is
   *   filled in place once the rules have been loaded and applied.
   */
  function magicScraper(rules, options = {}) {
    // The default parameter only covers `undefined`; tolerate an explicit null.
    const settings = options || {};
    // Never rebound: results are copied into this object so that the reference
    // handed back to the caller also receives data produced asynchronously.
    const scrapedData = {};

    /**
     * Appends a DOM representation of `data` to `element`.
     *
     * Strings and `{ html }` leaves become one element each (content assigned
     * through innerHTML, as the markup was read from this same page); other
     * objects become a container element that is filled recursively.
     *
     * @param {Object} data - Scraped data to render.
     * @param {Node} element - Parent that receives the created elements.
     * @param {boolean} keepChildren - When truthy, children that `element`
     *   already had before this call are moved into the first created leaf.
     */
    function createDOMFromScrapedData(data, element, keepChildren) {
      // A DocumentFragment has no tagName; fall back to a neutral wrapper.
      const tagName = element.tagName || 'div';
      // Snapshot up front so elements created below are never re-parented.
      let inherited = keepChildren ? Array.from(element.children || []) : [];

      function appendLeaf(html) {
        const leaf = document.createElement(tagName);
        leaf.innerHTML = html;
        inherited.forEach(child => leaf.appendChild(child));
        inherited = [];
        element.appendChild(leaf);
      }

      function render(value) {
        if (Array.isArray(value)) {
          value.forEach(render);
          return;
        }
        if (typeof value === 'string') {
          appendLeaf(value);
          return;
        }
        if (value === null || typeof value !== 'object') {
          return;
        }
        if (typeof value.html === 'string') {
          // Scraped leaf of the form { text, html }.
          appendLeaf(value.html);
          return;
        }
        const container = document.createElement(tagName);
        element.appendChild(container);
        createDOMFromScrapedData(value, container, keepChildren);
      }

      Object.keys(data).forEach(key => render(data[key]));
    }

    /**
     * Tests `current` against a wildcard pattern (case-insensitive, anchored).
     * Every `*` matches any run of characters and dots are literal; any other
     * regular-expression syntax in the pattern is passed through unchanged.
     *
     * @param {string} pattern
     * @param {string} current - Hostname or path to test.
     * @returns {boolean} False when the pattern does not match or is invalid.
     */
    function matchPageOrDomain(pattern, current) {
      const source = String(pattern)
        .replace(/\./g, '\\.')
        .replace(/\*/g, '.*'); // global: a string search would hit only the first '*'
      try {
        return new RegExp(`^${source}$`, 'i').test(current);
      } catch (err) {
        console.warn(`Ignoring invalid match pattern "${pattern}":`, err);
        return false;
      }
    }

    /**
     * Selects the rules that apply to the current domain and page.
     *
     * The first domain pattern that matches wins. Page rules are looked up by
     * exact path first, then by the first page key containing `*` that matches.
     * Page rules override domain rules with the same key.
     *
     * @param {Object} rulesObj - Full rule set keyed by domain pattern.
     * @param {string} currentDomain
     * @param {string} currentPage
     * @returns {Object} Merged rules without the `pages` map; `{}` if none apply.
     */
    function scrapeDataByRules(rulesObj, currentDomain, currentPage) {
      const isRuleSet = value => value !== null && typeof value === 'object';
      if (!isRuleSet(rulesObj)) {
        return {};
      }

      const domainKey = Object.keys(rulesObj).find(pattern =>
        matchPageOrDomain(pattern, currentDomain)
      );
      const domainData =
        domainKey !== undefined && isRuleSet(rulesObj[domainKey])
          ? rulesObj[domainKey]
          : {};

      let pageData = {};
      if (currentPage && isRuleSet(domainData.pages)) {
        const pages = domainData.pages;
        let pageKey = Object.prototype.hasOwnProperty.call(pages, currentPage)
          ? currentPage
          : undefined;
        if (pageKey === undefined) {
          pageKey = Object.keys(pages).find(
            pattern => pattern.includes('*') && matchPageOrDomain(pattern, currentPage)
          );
        }
        if (pageKey !== undefined && isRuleSet(pages[pageKey])) {
          pageData = pages[pageKey];
        }
      }

      // `pages` is configuration, not a scraping rule, so it must not leak
      // into the merged rule set.
      const merged = Object.assign({}, domainData);
      delete merged.pages;
      return Object.assign(merged, pageData);
    }

    /**
     * Fetches a JSON rule file and passes the rules applicable to the current
     * domain/page to `callback`. On any load failure the error is logged and
     * `callback` receives `{}`.
     *
     * @param {string} externalRulesURL
     * @param {string} currentDomain
     * @param {string} currentPage
     * @param {function(Object): void} callback - Invoked exactly once.
     */
    function loadExternalRules(externalRulesURL, currentDomain, currentPage, callback) {
      fetch(externalRulesURL)
        .then(response => {
          if (!response.ok) {
            throw new Error(`Unexpected HTTP status ${response.status}`);
          }
          return response.json();
        })
        .then(data => scrapeDataByRules((data && data.rules) || {}, currentDomain, currentPage))
        .catch(err => {
          console.error('Error loading external rules:', err);
          return {};
        })
        // Run the callback outside the load error handler so that a failure
        // while applying rules is not reported as a load failure and does not
        // trigger a second callback invocation.
        .then(callback)
        .catch(err => {
          console.error('Error applying scraping rules:', err);
        });
    }

    /**
     * Scrapes the document with the resolved rules, stores the result in
     * `scrapedData` and, if requested, replaces the page body.
     *
     * @param {Object} pageRules - Rules returned by `scrapeDataByRules`.
     */
    function handleRules(pageRules) {
      if (Object.keys(pageRules).length === 0) {
        console.warn('No rules found for the current domain and page.');
        return;
      }

      // Returns the data scraped under `root`; nested rules use their key as
      // the container selector and produce one result object per container.
      function processNestedRules(ruleSet, root) {
        const collected = {};
        Object.keys(ruleSet).forEach(key => {
          const selector = ruleSet[key];
          if (typeof selector === 'string') {
            collected[key] = Array.from(root.querySelectorAll(selector)).map(node => ({
              text: node.textContent,
              html: node.innerHTML,
            }));
          } else if (selector !== null && typeof selector === 'object') {
            collected[key] = Array.from(root.querySelectorAll(key)).map(container =>
              processNestedRules(selector, container)
            );
          }
        });
        return collected;
      }

      // Scrape first: replacing the body afterwards would destroy the source.
      const temporaryData = processNestedRules(pageRules, document);

      if (settings.replaceBody && document.body) {
        const fragment = document.createDocumentFragment();
        createDOMFromScrapedData(temporaryData, fragment, settings.keepChildren);
        document.body.innerHTML = '';
        document.body.appendChild(fragment);
      }

      Object.assign(scrapedData, temporaryData);
    }

    /** Resolves the rules for the current location and applies them. */
    function runScraping() {
      const currentDomain = window.location.hostname;
      const currentPage = window.location.pathname;

      if (typeof rules === 'string') {
        // A string is treated as the URL of an external rule file.
        loadExternalRules(rules, currentDomain, currentPage, handleRules);
      } else {
        handleRules(scrapeDataByRules(rules, currentDomain, currentPage));
      }
    }

    runScraping();
    return scrapedData;
  }

QingJ © 2025

镜像随时可能失效,请加Q群300939539或关注我们的公众号极客氢云获取最新地址