DxmCrawl

2024/6/5 10:03:47

此脚本不应直接安装。它是供其他脚本使用的外部库,要使用该库请加入元指令 // @require https://update.gf.qytechs.cn/scripts/497140/1389534/DxmCrawl.js

  1. // ==UserScript==
  2. // @name DxmCrawl
  3. // @namespace .dianxiaomi.com
  4. // @grant none
  5. // @version 1.0.0
  6. // @author -
  7. // @description 2024/6/5 10:03:47
  8. // ==/UserScript==
  9.  
  // Request payloads of products the server reported as already collected.
  var dataArr = [];
  // Container that receives one table row per duplicate product.
  var contentObj = $("#repeatCrawlModalContent .dxmRepeatBox");
  // Count of results recorded so far.
  var jj = 0;
  // Newline-separated list of URLs whose collection failed.
  var failsUrls = "";

  // Failure bookkeeping for the current account: the freshly collected failures
  // are later merged (de-duplicated) with the ones already cached locally for
  // the same UID so the assistant can display them.
  var failData = {
      // id of the logged-in user
      uid: null,
      // every item that was submitted for collection
      data: [],
      // ids of the items collected successfully
      crawlSuccessArr: [],
      // true once the collection run has finished
      isFinish: false
  };

  // Number of Temu batch requests scheduled so far (drives the request delay).
  var temuBatchCrawlNum = 0;
  23.  
  /**
   * Tells whether a value exposes no enumerable properties (own or inherited).
   * @param {*} e value to inspect; null/undefined count as empty
   * @returns {boolean} true when there is nothing to enumerate, false otherwise
   */
  var objectIsEmpty = function (e) {
      var foundKey = false;
      for (var name in e) {
          foundKey = true;
          break;
      }
      return !foundKey;
  };
  31.  
  // Checkbox toggle: enables the quantity box and pre-fills it with 50 when
  // checked, disables and clears it when unchecked.
  $(document)
      .off('click', 'input[name="categoryCheck"]')
      .on('click', 'input[name="categoryCheck"]', function () {
          var isChecked = $(this).is(':checked');
          var $quantity = $("#categoryNum");

          $quantity.prop('disabled', !isChecked);
          $quantity.val(isChecked ? '50' : '');
      });

  // Quantity box: strip every non-digit character as the user types.
  $(document)
      .off('input', '#categoryNum')
      .on('input', '#categoryNum', function () {
          var $field = $(this);
          var typed = $field.val();

          if (/[^0-9]/.test(typed)) {
              $field.val(typed.replace(/[^\d]/g, ""));
          }
      });

  // "Copy failed links": select the hidden field, copy it, then notify the user.
  $(document)
      .off('click', '#checkFailUrl')
      .on('click', '#checkFailUrl', function () {
          $('#dxmCopyUrl').select();
          document.execCommand('copy');
          $.fn.message({type: 'success',msg: '复制失败链接成功'});
      });
  50.  
  51. var Crawl = {
  // Crawler lookup tables. Each entry is a one-key object mapping a URL
  // substring to the crawler object that handles it. Lookups (getCrawlObject)
  // walk the array in order and return the first entry whose key occurs in the
  // URL, so earlier entries shadow later ones:
  //   - "ebay" precedes every "ebay.*" entry, so those later entries never win.
  //   - "amazon" (AmazonComCrawl) precedes every "amazon.*" entry (AmazonCrawl).
  //   - the first "chinavasion.com" (Chinavasion) precedes the second
  //     (ChinavasionCrawl).
  // NOTE(review): confirm the shadowed entries are intentionally inert.
  52. config:{
  // "single": crawlers for individual product pages.
  53. "single":[
  54. {"taobao.com" : TaobaoCrawl},
  55. {"1688.com" : ALiBabaCrawl},
  56. {"tmall.com": TmallCrawl},
  57. {"aliexpress.com" : SmtCrawl},
  58. {"aliexpress.us" : SmtCrawl},
  59. {"ebay" : EbayCrawl},
  60. {"amazon" : AmazonComCrawl},
  61. {"jd.com" : JDCrawl},
  62. {"alibaba.com" : AlibabaGjCrawl},
  63. {"dhgate.com" : DhgateCrawl},
  64. {"etsy.com" : EtsyCrawl},
  65. {"lazada" : LazadaCrawl},
  66. {"pfhoo.com" : PfhooCrawl},
  67. {"www.banggood.com" : BanggoodCrawl},
  68. {"sea.banggood.com" : BanggoodCrawl},
  69. {"chinavasion.com" : Chinavasion},
  70. {"gearbest.com" : GearbestCrawl},
  71. {"walmart.com" : WalmartCrawl},
  72. // {"walmart.ca" : WalmartCrawl},
  73. {"ebay.com" : EbayCrawl},
  74. {"ebay.co.uk":EbayCrawl},
  75. {"ebay.ca":EbayCrawl},
  76. {"ebay.de":EbayCrawl},
  77. {"ebay.fr":EbayCrawl},
  78. {"amazon.com" : AmazonCrawl},
  79. {"amazon.co.jp" : AmazonCrawl},
  80. {"amazon.cn":AmazonCrawl},
  81. {"amazon.co.uk" : AmazonCrawl},
  82. {"amazon.ca":AmazonCrawl},
  83. {"amazon.com.mx":AmazonCrawl},
  84. {"amazon.de":AmazonCrawl},
  85. {"amazon.fr":AmazonCrawl},
  86. {"us.banggood.com":BanggoodCrawl},
  87. {"usa.banggood.com":BanggoodCrawl},
  88. {"chinabrands.com" : ChinabrandsCrawl},
  89. {"chinabrands.cn":ChinabrandsCrawl},
  90. {"wish.com":WishCrawl},
  91. {"joom.com":JoomCrawl},
  92. {"tophatter.com":TophatterCrawl},
  93. {"haiyingshuju.com":HaiYingShuJuCrawl},
  94. {"yixuanpin.cn":YiXuanPinCrawl},
  95. {"shopee.":ShopeeCrawl},
  96. {"tw.shopeesz.com":ShopeeCrawl},
  97. {"xiapibuy.com":ShopeeCrawl},
  98. {"pifa.pinduoduo.com":PinduoduoCrawl},
  99. {"yangkeduo.com":YangkeduoCrawl},
  100. {"mobile.pinduoduo.com":YangkeduoCrawl},
  101. {"vova.com":VovaCrawl},
  102. {"17zwd.com":Zwd17Crawl},
  103. {"vvic.com":VvicCrawl},
  104. /* {"k3.cn":K3cnCrawl},
  105. {"bao66.cn":Bao66Crawl},*/
  106. {"17huo.com":Huo17Crawl},
  107. /*{"xingfujie.cn":XingfujieCrawl},*/
  108. {"chinavasion.com":ChinavasionCrawl},
  109. {"sooxie.com":SooxieCrawl},
  110. {"571xz.com":Xz571Crawl},
  111. {"kaola.com":KaolaCrawl},
  112. {"https://distributor.taobao.global/apps/product/detail" : TaobaoGlobalCrawl},
  113. /* {"2tong.cn" : twoTongCrawl},*/
  114. {"made-in-china.com" : madeInChinaCrawl},
  115. {"cjdropshipping.com" : cjDropshippingCrawl},
  116. {"qksource.com" : cjDropshippingCrawl},
  117. {"yiwugo.com" : yiWuGoCrawl},
  118. {"go2.cn" : go2Crawl},
  119. {".mercadolibre." : mercadoCrawl},
  120. {".mercadolivre.com.br/" : mercadoCrawl},
  121. {".temu.com/" : temuCrawl},
  122. {".coupang.com/" : coupangCrawl},
  123. {".ozon.com/" : ozonCrawl},
  124. {".ozon.ru/" : ozonCrawl},
  125. {".shein.com/" : sheinCrawl},
  126. {".daraz." : darazCrawl},
  127. {".wsy.com/" : wsyCrawl},
  128. {".91jf.com/": jfCrawl},
  129. {".fruugo." : fruggoCrawl},
  130. {".17qcc.com/": qcCrawl},
  131. {".wildberries.ru/": wildCrawl},
  132. {".tiktok.com/": tiktokCrawl},
  133. {".tokopedia.com/": tiktokCrawl},
  134. {".miravia.es/": miraviaCrawl},
  135. {".gigab2b.": gigaCrawl},
  136. // {".target.com/" : targetCrawl}
  137. ],
  // "category": crawlers for category/listing pages. checkUrl rejects
  // 1688.com URLs for this type even though an entry is listed here.
  138. "category":[
  139. {"taobao.com" : TaobaoCategoryCrawl},
  140. {"tmall.com" : TmallCategoryCrawl},
  141. {"1688.com" : ALiBaBaCategoryCrawl},
  142. {"ebay" : EbayCategoryCrawl},
  143. {"ebay.com" : EbayCategoryCrawl},
  144. {"ebay.co.uk":EbayCategoryCrawl},
  145. {"ebay.ca":EbayCategoryCrawl},
  146. {"ebay.de":EbayCategoryCrawl},
  147. {"ebay.fr":EbayCategoryCrawl},
  148. {"amazon.com" : AmazonCategoryCrawl},
  149. {"aliexpress.com" : SmtCategoryCrawl},
  150. {"aliexpress.us" : SmtCategoryCrawl}
  151. ]
  152. },
  153.  
  154. categoryCrawlTotalNum: 0, // total number of products queued for category collection
  155. categoryCrawlCountNum: 0, // number of category products already handed to a crawler
  156. categoryDataList: [], // accumulated product entries to collect in a category run
  157.  
  158. checkUrl: function(type, url){
  159. var urlArr = url.toLowerCase().split("\n");
  160. var errorArr = [];
  161. for(var i in urlArr){
  162. url = urlArr[i];
  163. if (!url || !url.trim()){
  164. continue;
  165. }
  166. var b = false,
  167. isAlibabaCategory = false,
  168. isTmallCategory = false,
  169. errorMsg = '采集地址无效,或不支持该网址采集。',
  170. errorMsg_1688 = '不支持输入1688链接进行分类采集,请进入1688产品页面,点击右下角分类采集按钮进行采集',
  171. errorMsg_Tmall = '天猫链接暂不支持使用该方式采集,请打开采集网址点击右下角采集按钮采集 ';
  172. for (var j in Crawl.config[type]) {
  173. for (var key in Crawl.config[type][j]) {
  174. if (key.indexOf('&-&') > -1) {
  175. var num = 0,
  176. amzKey = key.split('&-&');
  177. for (var k in amzKey) {
  178. if (url.indexOf(amzKey[k]) > -1) num++;
  179. }
  180. if (num === amzKey.length) {
  181. b = true;
  182. break;
  183. }
  184. } else {
  185. //如果是天猫采集,不管是单品还是分类采集都不支持采集,提示打开链接页面采集
  186. //天猫采集先在TmallCrawl.crawl方法里面把item.htm替换为item_o.htm老页面去采集,这里的判断先注释,以后如果老页面不存在了在放开
  187. // if (key === 'tmall.com' && url.indexOf('tmall.com') > -1) {
  188. // isTmallCategory = true;
  189. // break;
  190. // }
  191. if (type === 'category' && key === '1688.com' && url.indexOf('1688.com/') > -1) {
  192. // 分类采集移除对1688的支持
  193. isAlibabaCategory = true;
  194. break;
  195. } else if (url.indexOf(key) > -1) {
  196. b = true;
  197. break;
  198. }
  199. }
  200. }
  201. if (b) break;
  202. }
  203. //!b && errorArr.push("此地址暂不支持:" + url);
  204. if (!b){
  205. if (isAlibabaCategory) {
  206. errorArr.push(errorMsg_1688);
  207. } else if (isTmallCategory) {
  208. errorArr.push(errorMsg_Tmall);
  209. } else {
  210. errorArr.push(errorMsg);
  211. }
  212. }
  213. }
  214. return errorArr;
  215. },
  216.  
  217. getCrawlObject : function(type, url){
  218. for(var j in Crawl.config[type]){
  219. for(var key in Crawl.config[type][j]){
  220. var chKey = key;
  221. if(chKey.indexOf("&-&") > -1){
  222. chKey = key.split("&-&")[0];
  223. }
  224. if(url.indexOf(chKey) > -1){
  225. return Crawl.config[type][j][key];
  226. }
  227. }
  228. }
  229. return null;
  230. },
  231.  
  232. //拼多多跨境批量采集请求加延时处理方法封装
  233. temuBatchAjaxTimeOut: function(data, crawlCategory, checkFinish, total, timeOut) {
  234. timeOut += temuBatchCrawlNum * 4000;//控制每4s请求一次
  235. if (temuBatchCrawlNum && temuBatchCrawlNum % 10 === 0) timeOut += 5000;//每请求十次再延时累加5s
  236. temuBatchCrawlNum++;
  237. var ajaxTimeOut = setTimeout(function () {
  238. clearTimeout(ajaxTimeOut);
  239. ajaxTimeOut = null;
  240. Html.postHtml(URL_MANAGER.url.postHTML(), data, 0, function(result){
  241. if (+result.code === -10) {$.fn.message({type:'danger',msg:result.msg})}
  242. if (+result.code === -11) {$.fn.message({type:'danger',msg:result.msg})}
  243. var objCrawl = result.repeatCrawlProduct,
  244. isRepeat = 0,
  245. html = '';
  246.  
  247. if (objCrawl) isRepeat = objCrawl.repeatCrawl;
  248. if (!crawlCategory){
  249. if (+isRepeat === 1){//重复,生成td,并记录重复的data
  250. dataArr.push(data);
  251.  
  252. html = '<tr><td style="width:50px;">' +
  253. '<input name="sourceUrlRepeat" type="checkbox" value="' + data.url + '"/></td>' +
  254. '<td style="width:80px;"><div class="imgDivOut"><div class="imgDivIn">' +
  255. '<img src="' + objCrawl.imgUrl.split('|')[0] +
  256. '" class="imgCss" width="71px" height="71px"/></div></div></td>' +
  257. '<td style="text-align:left;"><a href="' + data.url +
  258. '" target="_blank">' + objCrawl.name + '</a></td>' +
  259. '<td style="width:80px;">' + (objCrawl.price !== null ? objCrawl.price : '') + '</td></tr>';
  260.  
  261. contentObj.append(html);
  262. $('#repeatCrawlModal').find('input[name = "sourceUrlRepeat"]').prop('checked', false);
  263. if (checkFinish && total === temuBatchCrawlNum){//表示采集完成,显示重复采集记录模态层
  264. dxmModal.hide("#crawlingModal");
  265. dxmModal.show("#repeatCrawlModal");
  266. //$("#loading").modal("hide");
  267. }
  268. }
  269. //记录采集结果 (最后一条+没有下一页才显示)
  270. Crawl.recordCrawlResult(result, data.url, isRepeat, crawlCategory);
  271. if (!dataArr.length && checkFinish && total === temuBatchCrawlNum){
  272. Crawl.displayCrawlResult(checkFinish && total === temuBatchCrawlNum);
  273. }else if(dataArr.length && checkFinish && total === temuBatchCrawlNum){
  274. dxmModal.hide("#crawlingModal");
  275. dxmModal.show("#repeatCrawlModal");
  276. temuBatchCrawlNum = 0;//清除计数
  277. timeOut = 0;
  278. }
  279. }else{
  280. //判断是否重复
  281. if (+isRepeat === 1){
  282. dataArr.push(data);
  283. html = '<tr class="content"><td class="has-ipt">' +
  284. '<input name="sourceUrlRepeat" type="checkbox" value="' + data.url + '">' +
  285. '</td><td class="img-box"><div class="img-out">' +
  286. '<img class="imgCss" src="' + objCrawl.imgUrl.split("|")[0] +
  287. '" width="50px" height="50px"/></div></td>' +
  288. '<td><a href="' + data.url + '" target="_blank">' + objCrawl.name + '</a></td>' +
  289. '<td class="num dxm-f-right">' + objCrawl.price + '</td></tr>';
  290. $('#repeatCrawlModal').find('#repeatValue').append(html).end().find('input[name = "sourceUrlRepeat"]').prop('checked', false);
  291. }
  292. Crawl.recordCrawlResult(result, data.url, isRepeat, crawlCategory);
  293. }
  294. });
  295. }, timeOut);
  296. },
  297.  
  298. //分类采集循环调用该方法,进行当前循环的单个产品采集
  299. singleCrawl : function(uid, urls, checkFinish,haveNext,crawlCategory, crawlState){
  300. var that = this,
  301. urlArr = [],
  302. total = 0,
  303. processNum = 0,
  304. timeOut = 4000,
  305. shopeeUrl = '';
  306.  
  307. temuBatchCrawlNum = 0;
  308. if (urls) {
  309. $.each(urls.split('\n'), function (i, j) {//去重处理
  310. var urlStr = $.trim(j);
  311.  
  312. if (urlStr && urlArr.indexOf(urlStr) === -1) urlArr.push(urlStr);
  313. if (crawlState === 'batchCrawlInit' && urlStr && !shopeeUrl && urlStr.indexOf('.xiapibuy.com/') !== -1 || urlStr.indexOf('/shopee.com') !== -1) shopeeUrl = urlStr;
  314. });
  315. }
  316. total = urlArr.length;
  317. //检测链接里面是否含有shopee平台链接,如果有,可能要走接口获取,先优先创建一个页面加载出请求头所带的参数,后面直接拿这份参数去走接口获取
  318. if (crawlState === 'batchCrawlInit' && shopeeUrl) {
  319. ShopeeCrawl.headersData = '';//清空重新初始化
  320. chrome.runtime.sendMessage('', {sign: 'getShopeeHeaderData', shopeeUrl: shopeeUrl}, function (res) {});
  321. return
  322. }
  323. for (var i in urlArr) {
  324. //去除链接中的逗号
  325. if(urlArr[i].indexOf(",")>-1){
  326. urlArr[i] = urlArr[i].replace(/,/g,"");
  327. }
  328. if(urlArr[i].indexOf('http:') !== 0 && urlArr[i].indexOf('https') !== 0){
  329. urlArr[i] = 'https:' + urlArr[i];
  330. }
  331. if (urlArr[i] && urlArr[i].indexOf("aliexpress.com/store/product/") !== -1 && urlArr[i].indexOf(".html") !== -1) {
  332. var itemId = urlArr[i].substring(urlArr[i].indexOf("aliexpress.com/store/product/") + 29, urlArr[i].indexOf(".html"));
  333. if (itemId && itemId.indexOf("/") !== -1) {
  334. var itemIdArray = itemId.split("/");
  335. urlArr[i] = urlArr[i].replace(itemIdArray[0]+'/',"");
  336. var newItemId = itemIdArray[1];
  337. if(urlArr[i] && newItemId && newItemId.indexOf("_") !== -1){
  338. var idArrays = newItemId.split("_");
  339. urlArr[i] = urlArr[i].replace(newItemId,idArrays[1]).replace("store/product","item");
  340. }
  341. }
  342. }
  343. var crawlObj = Crawl.getCrawlObject("single", urlArr[i]);
  344. crawlObj && crawlObj.crawl(urlArr[i], function(data){
  345. data.uid = uid;
  346. data.repeatCheck = 1; //必须查重
  347. // 截取亚马逊商品的链接 去除qid字符串
  348. var crawlUrl = data.url;
  349. if(crawlUrl.indexOf('www.amazon.com') > -1){
  350. if (crawlUrl.indexOf('qid') > -1){
  351. var begin = crawlUrl.indexOf('qid');
  352. var qidStr = crawlUrl.substring(begin, begin + 15);
  353. data.url = crawlUrl.replace(qidStr, '');
  354. }
  355. }
  356.  
  357. if(data.html){
  358. if (data.isTemu) {//如果是拼多多跨境批量操作,则单独加延迟处理,防止请求太快导致大量采集失败的问题
  359. that.temuBatchAjaxTimeOut(data, crawlCategory, checkFinish, total, timeOut);
  360. } else {
  361. Html.postHtml(URL_MANAGER.url.postHTML(), data, 0, function(result){
  362. processNum++;
  363. if(result.code == -10) {$.fn.message({type:'danger',msg:result.msg})}
  364. if(result.code == -11) {$.fn.message({type:'danger',msg:result.msg})}
  365. var objCrawl = result.repeatCrawlProduct;
  366.  
  367. var isRepeat = 0;
  368. if(objCrawl){
  369. isRepeat = objCrawl.repeatCrawl;
  370. }
  371.  
  372. //价格为null 显示空串
  373. if(objCrawl && objCrawl.price == null){
  374. result.repeatCrawlProduct.price = "";
  375. }
  376. if(!crawlCategory){
  377. if(isRepeat == 1){//重复,生成td,并记录重复的data
  378. dataArr.push(data);
  379.  
  380. var html = '<tr><td style="width:50px;">' +
  381. '<input name="sourceUrlRepeat" type="checkbox" value="' + data.url + '"/></td>' +
  382. '<td style="width:80px;"><div class="imgDivOut"><div class="imgDivIn">' +
  383. '<img src="' + result.repeatCrawlProduct.imgUrl.split('|')[0] +
  384. '" class="imgCss" width="71px" height="71px"/></div></div></td>' +
  385. '<td style="text-align:left;"><a href="' + data.url +
  386. '" target="_blank">' + result.repeatCrawlProduct.name + '</a></td>' +
  387. '<td style="width:80px;">' + result.repeatCrawlProduct.price + '</td></tr>';
  388.  
  389. contentObj.append(html);
  390. $('#repeatCrawlModal').find('input[name = "sourceUrlRepeat"]').prop('checked', false);
  391. if(checkFinish && total === processNum){//表示采集完成,显示重复采集记录模态层
  392. dxmModal.hide("#crawlingModal");
  393. dxmModal.show("#repeatCrawlModal");
  394. //$("#loading").modal("hide");
  395. }
  396. }
  397. //记录采集结果 (最后一条+没有下一页才显示)
  398. Crawl.recordCrawlResult(result, data.url, isRepeat, crawlCategory);
  399. if(dataArr.length == 0 && checkFinish && total === processNum){
  400. Crawl.displayCrawlResult(checkFinish && total === processNum);
  401. }else if(dataArr.length > 0 && checkFinish && total === processNum){
  402. dxmModal.hide("#crawlingModal");
  403. dxmModal.show("#repeatCrawlModal");
  404. }
  405.  
  406. //清空下tiktok采集的数据选中站点数据
  407. if (total === processNum && checkFinish) {
  408. $('#tiktokGatherDataId').val('').attr('data-tiktok', '');
  409. }
  410. }else{
  411. //判断是否重复
  412. if(isRepeat == 1){
  413. dataArr.push(data);
  414. var html = '<tr class="content"><td class="has-ipt">' +
  415. '<input name="sourceUrlRepeat" type="checkbox" value="' + data.url + '">' +
  416. '</td><td class="img-box"><div class="img-out">' +
  417. '<img class="imgCss" src="' + objCrawl.imgUrl.split("|")[0] +
  418. '" width="50px" height="50px"/></div></td>' +
  419. '<td><a href="' + data.url + '" target="_blank">' + objCrawl.name + '</a></td>' +
  420. '<td class="num dxm-f-right">' + objCrawl.price + '</td></tr>';
  421. $("#repeatCrawlModal").find('#repeatValue').append(html);
  422. $('#repeatCrawlModal').find('input[name = "sourceUrlRepeat"]').prop('checked', false);
  423. }
  424. Crawl.recordCrawlResult(result, data.url, isRepeat, crawlCategory);
  425. }
  426.  
  427. });
  428. }
  429. }else{
  430. if(!crawlCategory){
  431. processNum++;
  432. data.code = -1;
  433. data.msg = "采集内容为空!";
  434. Crawl.recordCrawlResult(data, data.url, 0, crawlCategory);
  435. if(dataArr.length == 0 && checkFinish && total === processNum){
  436. Crawl.displayCrawlResult(checkFinish && total === processNum);
  437. }else if(dataArr.length > 0 && checkFinish && total === processNum){
  438. $('#repeatCrawlModal').find('input[name = "sourceUrlRepeat"]').prop('checked', false);
  439. dxmModal.hide("#crawlingModal");
  440. dxmModal.show("#repeatCrawlModal");
  441. //$("#loading").modal("hide");
  442. }
  443. } else {
  444. Crawl.recordCrawlResult({}, data.url, 0, crawlCategory);
  445. }
  446. }
  447. }, false, false);
  448. }
  449. },
  450.  
  451. //分类采集入口,crawlCategory=true则是列表页分类采集,false则是插件详情页分类链接采集
  452. categoryCrawl: function(uid, url, crawlCategory, categoryNum){
  453. var crawlObj = Crawl.getCrawlObject('category', url); //当前是采集的哪个平台的分类采集,获取对应平台的对象函数
  454.  
  455. // 新版分类采集逻辑,必须当前产品采集完成之后才会调用下一个产品的采集
  456. // 速卖通、天猫、淘宝、Amazon的分类采集先单独走一个逻辑,后续再把所有的分类采集都替换掉
  457. if (this.judgeUrlByPlatform(url) && crawlCategory) { //如果是天猫采集并且是列表页的分类采集
  458. Crawl.newCategoryCrawlProcess(uid, url, crawlObj, crawlCategory, categoryNum, categoryNum, 1);
  459. } else { //其它平台先走之前的逻辑,所有产品一次性调用采集
  460. Crawl.categoryCrawlProcess(uid, url, crawlObj, crawlCategory, categoryNum, 0, 1);
  461. }
  462. },
  463.  
  464. // 验证是否需要调用新版分类采集逻辑,新版采集逻辑必须是当前产品采集完成之后才会调用下一个产品的采集
  465. judgeUrlByPlatform: function (url) {
  466. var flag = false;
  467.  
  468. // 指定平台的分类采集先单独走一个逻辑,后续再把所有的分类采集都替换掉
  469. if (url.indexOf('aliexpress.') !== -1) { //速卖通
  470. flag = true;
  471. } else if (url.indexOf('taobao.com') !== -1) { //淘宝
  472. flag = true;
  473. } else if (url.indexOf('tmall.com') !== -1 || url.indexOf('tmall.hk') !== -1) { //天猫
  474. flag = true;
  475. } else if (url.indexOf('amazon.') !== -1) { //Amazon
  476. flag = true;
  477. }
  478. return flag; //返回true则是调用新版分类采集逻辑
  479. },
  480.  
  481. //初始化分类采集的计数等字段值
  482. initCategoryNumFn: function () {
  483. dataArr = [];
  484. jj = 0;
  485. failsUrls = '';
  486. Crawl.categoryCrawlTotalNum = 0;
  487. Crawl.categoryCrawlCountNum = 0;
  488. Crawl.categoryDataList = [];
  489. },
  490.  
  491. //获取分类采集的每一页的要采集的产品数据,产品分类列表页采集和插件详情页输入链接采集都调用的这个方法
  492. categoryCrawlProcess: function(uid, url, crawlObj,crawlCategory,categoryNum, setTimeCount, pageNum){
  493. crawlObj && crawlObj.crawl(url, function(data){ // 回调函数
  494. if (!data) return;//为空则是有滑动条暂停采集
  495. if (data.list && data.list.length > 0) {
  496. Crawl.categoryCrawlTotalNum += data.list.length;
  497. var begin = (pageNum - 1) * data.list.length;
  498. pageNum++;
  499. if (url.indexOf("aliexpress.com") > 0 || url.indexOf("ebay.com") > 0) {
  500. for (var i in data.list) {
  501. if (categoryNum) {
  502. if(data.list.length > categoryNum){
  503. if (i < categoryNum) {
  504. setTimeout(function () {
  505. Crawl.singleCrawl(uid, data.list[setTimeCount], Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum - 1), true, crawlCategory); //标志分类采集已经结束
  506. Crawl.categoryCrawlCountNum++;
  507. setTimeCount++;
  508. if (setTimeCount >= categoryNum) {
  509. setTimeCount = 0;
  510. }
  511. }, (begin + parseInt(i)) * 1000);
  512. }
  513. }else {
  514. setTimeout(function () {
  515. Crawl.singleCrawl(uid, data.list[setTimeCount], Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum - 1), true, crawlCategory); //标志分类采集已经结束
  516. Crawl.categoryCrawlCountNum++;
  517. setTimeCount++;
  518. if (setTimeCount >= data.list.length) {
  519. setTimeCount = 0;
  520. }
  521. }, (begin + parseInt(i)) * 1000);
  522. }
  523. } else {
  524. setTimeout(function () {
  525. Crawl.singleCrawl(uid, data.list[setTimeCount], Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum - 1), true, crawlCategory); //标志分类采集已经结束
  526. Crawl.categoryCrawlCountNum++;
  527. setTimeCount++;
  528. if (setTimeCount >= data.list.length) {
  529. setTimeCount = 0;
  530. }
  531. }, (begin + parseInt(i)) * 1000);
  532. }
  533. }
  534. } else if (url.indexOf('taobao.com') > 0 || url.indexOf('tmall.com') > 0 || url.indexOf('tmall.hk') > 0) {
  535. for (var i in data.list) {
  536. if (categoryNum) {
  537. if(data.list.length > categoryNum){
  538. if (i < categoryNum) {
  539. setTimeout(function () {
  540. Crawl.singleCrawl(uid, data.list[setTimeCount], Crawl.categoryCrawlCountNum == (Crawl.categoryCrawlTotalNum - 1), true, crawlCategory); //标志分类采集已经结束
  541. Crawl.categoryCrawlCountNum++;
  542. setTimeCount++;
  543. if (setTimeCount >= categoryNum) {
  544. setTimeCount = 0;
  545. }
  546. }, (begin + parseInt(i)) * 5000);
  547. }
  548. }else {
  549. setTimeout(function () {
  550. Crawl.singleCrawl(uid, data.list[setTimeCount], Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum - 1), true, crawlCategory); //标志分类采集已经结束
  551. Crawl.categoryCrawlCountNum++;
  552. setTimeCount++;
  553. if (setTimeCount >= data.list.length) {
  554. setTimeCount = 0;
  555. }
  556. }, (begin + parseInt(i)) * 5000);
  557. }
  558. } else {
  559. setTimeout(function () {
  560. Crawl.singleCrawl(uid, data.list[setTimeCount], Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum - 1), true, crawlCategory); //标志分类采集已经结束
  561. Crawl.categoryCrawlCountNum++;
  562. setTimeCount++;
  563. if (setTimeCount >= data.list.length) {
  564. setTimeCount = 0;
  565. }
  566. }, (begin + parseInt(i)) * 5000);
  567. }
  568. }
  569. } else {
  570. //判断是否结束需要两个条件 是否是最后一个+是否还有下一页
  571. for(var i in data.list){
  572. if(categoryNum){
  573. if (data.list.length > categoryNum) {
  574. if (i < categoryNum) {
  575. Crawl.singleCrawl(uid, data.list[i], Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum - 1), !data.next, crawlCategory); //标志分类采集已经结束
  576. Crawl.categoryCrawlCountNum++;
  577. }
  578. }else{
  579. Crawl.singleCrawl(uid, data.list[i], Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum-1),!data.next,crawlCategory); //标志分类采集已经结束
  580. Crawl.categoryCrawlCountNum++;
  581. }
  582.  
  583. }else{
  584. Crawl.singleCrawl(uid, data.list[i], Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum-1),!data.next,crawlCategory); //标志分类采集已经结束
  585. Crawl.categoryCrawlCountNum++;
  586. }
  587. }
  588. }
  589. } else {
  590. if(!crawlCategory && pageNum === 1){ //只有请求第一页的时候就没有采集数据时才进入
  591. $("#failDetailDiv").show();
  592. $("#failDetail").append("<p>采集结果为空,请检查是否为分类采集!</p>");
  593. Crawl.displayCrawlResult(false);
  594. }
  595. }
  596. if(crawlCategory){
  597. $('.dxmTotalNum').text(Crawl.categoryCrawlTotalNum)
  598. }
  599.  
  600. if(data.next){
  601. if(categoryNum){
  602. if(data.list.length < categoryNum){
  603. categoryNum = categoryNum - data.list.length;
  604. Crawl.categoryCrawlProcess(uid,data.next,crawlObj,crawlCategory,categoryNum, 0, pageNum);
  605. } else {
  606. Crawl.categoryCrawlTotalNum = Crawl.categoryCrawlTotalNum - (data.list.length - categoryNum);
  607. }
  608. }else{
  609. Crawl.categoryCrawlProcess(uid, data.next, crawlObj,crawlCategory, categoryNum, 0, pageNum);
  610. }
  611. } else {
  612. if(categoryNum){
  613. if(data.list.length > categoryNum){
  614. Crawl.categoryCrawlTotalNum = Crawl.categoryCrawlTotalNum - (data.list.length - categoryNum);
  615. }
  616. }
  617. }
  618.  
  619. if (data.list && !data.list.length) {
  620. var $msgModal = $('#msgModal');
  621. // $('#dxmCopyUrl').val(url);
  622. // $msgModal.find('.crawProgressBar').css('width', '100%');
  623. $('#myModalLabel').text('采集成功');
  624. $msgModal.find('.dxmCategoryCrawlNumBox').show();
  625. }
  626. }, pageNum, {uid: uid,
  627. url: url,
  628. crawlObj: crawlObj,
  629. crawlCategory: crawlCategory,
  630. categoryNum: categoryNum,
  631. setTimeCount: setTimeCount});
  632. },
  633.  
  634. showCrawlingResult: function(){
  635. $('#crawlDesc').text('数据采集中,请稍后......');
  636. $('#crawlingDesc').text('正在采集......');
  637. dxmModal.show('#crawlingModal');
  638. },
  639.  
  640. displayCrawlResult: function(isFinish){
  641. isFinish && $("#crawlDesc").text("采 集 完 成!");
  642. dxmModal.hide("#crawlingModal");
  643. dxmModal.show("#crawlResultModal");
  644. },
  645.  
  646. /*
  647. * 计算采集的进度
  648. * @param url=要采集的分类产品链接
  649. * @param repeat=是否重复采集,0没重复,1重复
  650. * @param crawlCategory=true是列表页的分类采集,false是插件详情页的分类链接采集
  651. * @param isRepeatCrawlConfirm=是否是重复采集弹层的确认重复采集
  652. * @param repeatCrawlCount=重复采集弹层的确认重复采集后,勾选的需要重复采集的产品数量
  653. * */
  654. recordCrawlResult: function(data, url, repeat, crawlCategory, isRepeatCrawlConfirm, repeatCrawlCount){
  655. var dxmTotalNum = Crawl.categoryCrawlTotalNum, //+($('.dxmTotalNum').text()),
  656. $msgModal = $('#msgModal'),
  657. $successNum = !crawlCategory ? $('#successNum') : $msgModal.find('.dxm-f-blue'),
  658. $failNum = !crawlCategory ? $('#failNum') : $msgModal.find('.dxmFail'),
  659. successNum = parseInt($successNum.text()),
  660. failNum = parseInt($failNum.text());
  661. $('#checkFailUrl').show();
  662. if (!crawlCategory) {
  663. $('#crawlingDesc').html('正在采集:' + url);
  664. if (+repeat !== 1) {
  665. if (!+data.code) {
  666. $successNum.text(successNum + 1);
  667. } else {
  668. $failNum.text(failNum + 1);
  669. failsUrls += url + '\n';
  670. if (data.msg) {
  671. $('#dxmCopyUrl').val(failsUrls);
  672. $('#failDetailDiv').show();
  673. $('#failDetail').append('<p>' + data.msg + '<br/>采集url:' + url + '</p>');
  674. }
  675. }
  676. }
  677. } else {
  678. if (+repeat !== 1) {
  679. if (!+data.code) {
  680. if (isNaN(successNum)) successNum = 0;
  681. successNum += 1;
  682. $msgModal.find('.completionNum').text(successNum);
  683. $successNum.text(successNum);
  684. if (+$msgModal.find('.dxmCount').text() > 0) {
  685. $msgModal.find('.crawProgressBar').css('width', ((successNum + (+$failNum.text())) / $msgModal.find('.dxmCount').text()) * 100 + '%');
  686. } else {
  687. $msgModal.find('.crawProgressBar').css('width', ((successNum + (+$failNum.text())) / dxmTotalNum) * 100 + '%');
  688. }
  689. if (showCrawlFailState) Crawl.getCrawlSuccessData(data.id);//收集采集成功的id数据,用于删除获取失败数据
  690. } else {
  691. if (isNaN(failNum)) failNum = 0;
  692. failNum += 1;
  693. $failNum.text(failNum);
  694. failsUrls += url + '\n';
  695. if (+$msgModal.find('.dxmCount').text() > 0) {
  696. $msgModal.find('.crawProgressBar').css('width', (+$successNum.text() + failNum / $msgModal.find('.dxmCount').text()) * 100 + '%');
  697. } else {
  698. $msgModal.find('.crawProgressBar').css('width', ((+$successNum.text() + failNum) / dxmTotalNum) * 100 + '%');
  699. }
  700. }
  701. if ((+$successNum.text() + failNum) === dxmTotalNum || (+$successNum.text() + failNum) === +$msgModal.find('.dxmCount').text()) {
  702. $('#dxmCopyUrl').val(failsUrls);
  703. failsUrls = '';
  704. $('#myModalLabel').text('采集成功');
  705. $successNum.text($successNum.text());
  706. $failNum.text($failNum.text());
  707. $msgModal.find('.dxmCategoryCrawlNumBox').show();
  708. // 如果采集是淘宝和天猫平台,则将采集失败的数据保存到到浏览器数据库
  709. if (showCrawlFailState) Crawl.startSaveCrawlFailData();
  710. }
  711. }
  712. }
  713. // 统计当前分类采集完成的次数
  714. jj++;
  715. // 如果当前分类采集完成的次数等于了分类采集需要采集的总数,并且有重复采集的产品信息
  716. // 弹出重复采集的产品弹层,让用户确认是否采集这些重复的产品
  717. if (jj === dxmTotalNum && dataArr.length) dxmModal.show('#repeatCrawlModal');
  718. // 如果是重复采集弹层确认,并且当前是确认重复采集的最后一个产品则进入if
  719. failsUrls && $('#dxmCopyUrl').val(failsUrls);
  720. if (isRepeatCrawlConfirm && jj - dxmTotalNum === repeatCrawlCount) {
  721. failsUrls = '';
  722. $('#myModalLabel, #crawlDesc').text('采集成功');
  723. $msgModal.find('.dxmCategoryCrawlNumBox').show();
  724. }
  725. },
  726.  
  727. //采集失败数据开始保存
  728. startSaveCrawlFailData: function() {
  729. if (failData) {
  730. failData.isFinish = true;//保存数据
  731. //采集失败的数据传递到background.js
  732. var crawlFailNum = +($('#msgModal .dxmFail').text()),
  733. oldCrawlData = [],
  734. crawlFailData = [],
  735. crawlFailIdArr = [];//当前采集失败数据的产品id,用于判断是否重复,如果重复,删除旧数据,使用采集过来的新数据
  736.  
  737. $('.dxmCrawlFailLink')[crawlFailNum ? 'removeClass' : 'addClass']('dxm-hide');//如果有采集失败才放开入口
  738. if (!crawlFailNum) return;//没有采集失败的不做保存处理,直接结束掉程序
  739. //如果有采集成功的情况,则在所有采集数据里面移除采集成功的数据,则剩余数据判为采集失败的
  740. if (crawlFailNum && failData.data.length) {
  741. if (failData.crawlSuccessArr.length) {//如果有部分采集成功
  742. $.each(failData.data, function (i, j) {
  743. //移除采集成功的数据以及本地采集数据去重
  744. if (failData.crawlSuccessArr.indexOf(j.pro_id) === -1 && crawlFailIdArr.indexOf(j.pro_id) === -1) {
  745. crawlFailData.push(j);
  746. crawlFailIdArr.push(j.pro_id);
  747. }
  748. });
  749. } else {//如果都是采集失败的
  750. crawlFailData = failData.data;
  751. }
  752. }
  753.  
  754. //先查询一遍,再做数据的替换更新
  755. chrome.runtime.sendMessage('', {sign: 'dxmCrawlFail', action: 'getCrawlFailData', data: ''}, function (oldData) {
  756. if (!+oldData.code) oldCrawlData = oldData.result;//取旧数据
  757. Crawl.crawlFailDataHandle(failData.uid, oldCrawlData, crawlFailData, crawlFailIdArr);
  758. });
  759. }
  760. },
  761.  
  762. //获取当前采集成功的产品id
  763. getCrawlSuccessData: function(id) {
  764. if (id && failData.crawlSuccessArr.indexOf(id) === -1) failData.crawlSuccessArr.push(id);//采集成功的id
  765. },
  766.  
  767. //采集失败数据重新处理
  768. crawlFailDataHandle: function(userId, oldData, crawlFailData, crawlFailIdArr) {
  769. var newData = [];
  770.  
  771. if (oldData && oldData.length) {//取本地存储的历史数据(距离当前时间三天内的采集失败的数据)
  772. var nowTime = (new Date()).getTime(),
  773. threeDayTime = 1000*60*60*24*3,
  774. lastTime = nowTime - threeDayTime;//最后到期的时候
  775.  
  776. $.each(oldData, function (i, j) {//取未过期数据、对旧数据去重
  777. if (lastTime <= j.updateTime && crawlFailIdArr.indexOf(j.pro_id) === -1) {
  778. crawlFailData.push(j);//只把未过期的数据插入到后面,最新的数据在前面
  779. }
  780. });
  781. }
  782.  
  783. if (crawlFailData.length) {
  784. //如果超出1000条数据,则只截取前1000条数据(保留最新的前1000条)
  785. newData = crawlFailData.length > 1000 ? crawlFailData.slice(0, 1000) : JSON.parse(JSON.stringify(crawlFailData));
  786. }
  787. //更新浏览器数据库数据
  788. chrome.runtime.sendMessage('', {sign: 'dxmCrawlFail', action: 'replaceCrawlFailData', data: newData});
  789. },
  790.  
  791. fillUrl : function(url,httpFlag){
  792. if(url){
  793. if(url.indexOf("http") == -1 && url.indexOf("HTTP") == -1){
  794. if(httpFlag){
  795. url = "https:" + url;
  796. }else{
  797. url = "http:" + url;
  798. }
  799. }else if(url.indexOf("http") > 0 || url.indexOf("HTTP") > 0){
  800. var sp = url.indexOf("HTTP") > 0 ? "HTTP" : "http";
  801. var urlArray = url.split(sp);
  802. url = "http" + urlArray[1];
  803. }
  804. }
  805. return url;
  806.  
  807. },
  808.  
  809.  
  810. //递归循环采集产品方法
  811. forEachRequestCrawlUrl: function (uid, url, crawlObj,crawlCategory,categoryNum, pageNum, forIndex) {
  812. if (SmtCategoryCrawl.stop) return;
  813. var checkFinish = Crawl.categoryCrawlCountNum === (Crawl.categoryCrawlTotalNum - 1);
  814. Crawl.newSingleCrawl(uid, Crawl.categoryDataList[forIndex], checkFinish , true, crawlCategory, function () {
  815. forIndex++;
  816. //如果已采数量小于总数量,则进入if进行下一个产品的采集
  817. if (Crawl.categoryCrawlCountNum < Crawl.categoryCrawlTotalNum) {
  818. if (url.indexOf('aliexpress.') > 0 || url.indexOf('ebay.com') > 0
  819. || url.indexOf('tmall.com') > 0|| url.indexOf('tmall.hk') > 0) {
  820. setTimeout(function () {
  821. Crawl.forEachRequestCrawlUrl(uid, url, crawlObj,crawlCategory,categoryNum, pageNum, forIndex);
  822. }, 1000);
  823. } else if (url.indexOf('taobao.com') > 0) {
  824. setTimeout(function () {
  825. Crawl.forEachRequestCrawlUrl(uid, url, crawlObj,crawlCategory,categoryNum, pageNum, forIndex);
  826. }, 3000);
  827. } else {
  828. Crawl.forEachRequestCrawlUrl(uid, url, crawlObj,crawlCategory,categoryNum, pageNum, forIndex);
  829. }
  830. }
  831. }); //标志分类采集已经结束
  832. //累计已采数量
  833. Crawl.categoryCrawlCountNum++;
  834. },
  835.  
  836. /*
  837. * 获取分类采集的每一页的要采集的产品数据,产品分类列表页采集和插件详情页输入链接采集都调用的这个方法
  838. * @param url=要采集的分类产品链接
  839. * @param crawlObj=是采集哪个平台的对象函数
  840. * @param crawlCategory=true是列表页的分类采集,false是插件详情页的分类链接采集
  841. * @param categoryNum=是否只采集指定数量的产品(即最大可采数量),这个字段是插件详情页的分类链接采集那里才会传
  842. * @param remainingCategoryNum=剩余的可采数量,用来计算是否还需要走下一页的请求
  843. * @param pageNum=要采集的产品分页
  844. * */
  845. newCategoryCrawlProcess: function(uid, url, crawlObj, crawlCategory, categoryNum, remainingCategoryNum, pageNum){
  846. //先获取当前页的可采产品数据
  847. crawlObj && crawlObj.crawl(url, function (data) { // 回调函数
  848. if (!data) return;
  849. //判断当前页的可采产品数据是否有值
  850. if (data.list && data.list.length) {
  851. //累加当前已获取到的可采产品数量
  852. Crawl.categoryCrawlTotalNum += data.list.length;
  853.  
  854. //合并需要采集的产品数据
  855. Crawl.categoryDataList = Crawl.categoryDataList.concat(data.list);
  856. if (pageNum === 1 || SmtCategoryCrawl.stop) { //只有第一页的时候才调用,后面只需要合并数据,不需要调用递归方法
  857. //手动递归循环采集,只有当前循环的产品采集完成之后,才会进行下一个产品的采集
  858. var stop = SmtCategoryCrawl.stop;
  859. if (SmtCategoryCrawl.stop) SmtCategoryCrawl.stop = false;
  860. Crawl.forEachRequestCrawlUrl(uid, url, crawlObj, crawlCategory, categoryNum, pageNum, (stop && SmtCategoryCrawl.detailDataObj && SmtCategoryCrawl.detailDataObj.forIndex !== undefined) ? SmtCategoryCrawl.detailDataObj.forIndex : 0);
  861. }
  862.  
  863. pageNum++; //页数+1,用来获取下一页的产品数据
  864. } else {
  865. if (!crawlCategory && pageNum === 1) { //只有请求第一页的时候就没有采集数据时才进入
  866. $('#failDetailDiv').show();
  867. $('#failDetail').append('<p>采集结果为空,请检查是否为分类采集!</p>');
  868. Crawl.displayCrawlResult(false);
  869. }
  870. }
  871.  
  872. if (crawlCategory) { //如果是列表页点击的分类采集,把产品总数渲染到节点上展示
  873. $('.dxmTotalNum').text(Crawl.categoryCrawlTotalNum)
  874. }
  875. if (SmtCategoryCrawl.stop) SmtCategoryCrawl.stop = false;
  876. if (data.next) { //如果有下一页的url
  877. if (remainingCategoryNum) { //判断是否勾选了只采集前多少个(即最大可采产品数量),只有插件详情页通过链接采集时才会有值
  878. //比如当前获取到的产品数量是50,当前剩余可采数量是80,categoryNum最大可采数是80
  879. if(data.list.length < remainingCategoryNum){ //如果当前获取到的产品数量小于剩余的最大可采数量
  880. //用80-50得到最新的剩余可采数量=30,那么下一次进来如果data.list还是50条,就会执行下面的else
  881. remainingCategoryNum = remainingCategoryNum - data.list.length; //拿最大可采数量减去当前产品数量,得到最新的剩余最大可采数量
  882. //继续调用当前方法进行下一页的产品数据请求
  883. Crawl.newCategoryCrawlProcess(uid, data.next, crawlObj, crawlCategory, categoryNum, remainingCategoryNum, pageNum);
  884. } else { //如果当前获取到的产品数量大于剩余可采数量
  885. Crawl.categoryCrawlTotalNum = categoryNum; //直接等于最大可采数量
  886. }
  887. }else{
  888. //继续调用当前方法进行下一页的产品数据请求
  889. Crawl.newCategoryCrawlProcess(uid, data.next, crawlObj, crawlCategory, categoryNum, remainingCategoryNum, pageNum);
  890. }
  891. } else { //如果没有下一页的请求了
  892. if(remainingCategoryNum){ //如果剩余可采数量还有
  893. //如果当前获取到的产品数量大于剩余可采数量
  894. if(data.list.length > remainingCategoryNum){
  895. Crawl.categoryCrawlTotalNum = categoryNum; //直接等于最大可采数量
  896. }
  897. }
  898. }
  899. //如果当前页没有可采产品了
  900. if (data.list && !data.list.length) {
  901. var $msgModal = $('#msgModal');
  902. // $('#dxmCopyUrl').val(url);
  903. // $msgModal.find('.crawProgressBar').css('width', '100%');
  904. $('#myModalLabel').text('采集成功');
  905. $msgModal.find('.dxmCategoryCrawlNumBox').show();
  906. }
  907. }, pageNum, {uid: uid,
  908. url: url,
  909. crawlObj: crawlObj,
  910. crawlCategory: crawlCategory,
  911. categoryNum: categoryNum});
  912. },
  913.  
  914. /*
  915. * 计算采集的进度
  916. * @param url=要采集的分类产品链接
  917. * @param crawlObj=是采集哪个平台的对象函数
  918. * @param crawlCategory=true是列表页的分类采集,false是插件详情页的分类链接采集
  919. * @param categoryNum=是否只采集指定数量的产品(即最大可采数量),这个字段是插件详情页的分类链接采集那里才会传
  920. * @param remainingCategoryNum=剩余的可采数量,用来计算是否还需要走下一页的请求
  921. * @param pageNum=要采集的产品分页
  922. * */
  923. newRecordCrawlResult: function(data, url, repeat, crawlCategory, isRepeatCrawlConfirm, repeatCrawlCount){
  924. var dxmTotalNum = Crawl.categoryCrawlTotalNum, //需要采集的产品总数 //+($('.dxmTotalNum').text()),
  925. $dxmCopyUrl = $('#dxmCopyUrl'), //用来保存失败链接的文本域节点,复制失败链接时用到
  926. $msgModal = $('#msgModal'), //信息展示弹层
  927. $successNum = !crawlCategory ? $('#successNum') : $msgModal.find('.dxm-f-blue'), //判断是列表页的分类采集还是插件详情页的分类链接采集,取不同的节点
  928. $failNum = !crawlCategory ? $('#failNum') : $msgModal.find('.dxmFail'), //判断是列表页的分类采集还是插件详情页的分类链接采集,取不同的节点
  929. successNum = parseInt($successNum.text()), //获取当前已经采集成功的产品数量
  930. failNum = parseInt($failNum.text()); //获取当前已经采集失败的产品数量
  931.  
  932. $dxmCopyUrl.val('');
  933. $('#checkFailUrl').show(); //复制失败链接节点显示
  934. if (!crawlCategory) { //如果是插件详情页分类链接采集则进入if
  935. $('#crawlingDesc').html('正在采集:' + url);
  936. if (+repeat !== 1) { //不是重复采集的情况才进入
  937. if (!+data.code) { //如果采集成功
  938. $successNum.text(successNum + 1); //成功数+1
  939. } else { //如果采集失败
  940. $failNum.text(failNum + 1); //失败数+1
  941. failsUrls += url + '\n'; //失败链接拼接
  942. if (data.msg) {
  943. $dxmCopyUrl.val(failsUrls);
  944. $('#failDetailDiv').show();
  945. $('#failDetail').append('<p>' + data.msg + '<br/>采集url:' + url + '</p>');
  946. }
  947. }
  948. }
  949. } else { //如果是列表页分类采集则进入else
  950. if (+repeat !== 1) { //不是重复采集的情况才进入
  951. if (!+data.code) { //如果采集成功
  952. if (isNaN(successNum)) successNum = 0;
  953. successNum += 1; //成功数+1
  954. $msgModal.find('.completionNum').text(successNum);
  955. $successNum.text(successNum);
  956. //计算进度条进度
  957. if (+$msgModal.find('.dxmCount').text() > 0) {
  958. $msgModal.find('.crawProgressBar').css('width', ((successNum + (+$failNum.text())) / $msgModal.find('.dxmCount').text()) * 100 + '%');
  959. } else {
  960. $msgModal.find('.crawProgressBar').css('width', ((successNum + (+$failNum.text())) / dxmTotalNum) * 100 + '%');
  961. }
  962.  
  963. if (showCrawlFailState) Crawl.getCrawlSuccessData(data.id);//收集采集成功的id数据,用于删除获取失败数据
  964. } else { //如果采集失败
  965. if (isNaN(failNum)) failNum = 0;
  966. failNum += 1; //失败数+1
  967. $failNum.text(failNum);
  968. failsUrls += url + '\n'; //失败链接拼接
  969. //计算进度条进度
  970. if (+$msgModal.find('.dxmCount').text() > 0) {
  971. $msgModal.find('.crawProgressBar').css('width', (+$successNum.text() + failNum / $msgModal.find('.dxmCount').text()) * 100 + '%');
  972. } else {
  973. $msgModal.find('.crawProgressBar').css('width', ((+$successNum.text() + failNum) / dxmTotalNum) * 100 + '%');
  974. }
  975. }
  976. //如果采集成功数加失败数等于总数量,或者采集成功数加失败数等于采集进度弹层里的产品总数节点展示数量
  977. if ((+$successNum.text() + failNum) === dxmTotalNum || (+$successNum.text() + failNum) === +$msgModal.find('.dxmCount').text()) {
  978. $dxmCopyUrl.val(failsUrls);
  979. failsUrls = '';
  980. $('#myModalLabel').text('采集成功');
  981. $successNum.text($successNum.text());
  982. $failNum.text($failNum.text());
  983. $msgModal.find('.dxmCategoryCrawlNumBox').show();
  984. // 如果采集是淘宝和天猫平台,则将采集失败的数据保存到到浏览器数据库
  985. if (showCrawlFailState) Crawl.startSaveCrawlFailData();
  986. }
  987. }
  988. }
  989. // 统计当前分类采集完成的次数-这个判断是正常第一次采集的逻辑判断,非重复采集确认的逻辑
  990. // 如果当前分类采集完成的次数等于了分类采集需要采集的总数,并且有重复采集的产品信息
  991. // 弹出重复采集的产品弹层,让用户确认是否采集这些重复的产品
  992. if (Crawl.categoryCrawlCountNum === dxmTotalNum && dataArr.length) {
  993. $(document).off('click', '#submitRepeatCrawl').on('click', '#submitRepeatCrawl', function () {
  994. submitRepeatCrawl(true);
  995. });
  996. dxmModal.show('#repeatCrawlModal');
  997. }
  998. // 如果是重复采集弹层确认,判断当前已采集数量减去产品总数量是否等于需要重复采集的产品数量
  999. //比如repeatCrawlCount需要重复采集的数量是5,dxmTotalNum产品总数量是10,Crawl.categoryCrawlCountNum已采数量是10
  1000. //那么重复采集确认时,不停累加Crawl.categoryCrawlCountNum已采数量,累加到15,减去总数量的10,则刚好等于需要重复采集的数量5
  1001. if (isRepeatCrawlConfirm && Crawl.categoryCrawlCountNum - dxmTotalNum === repeatCrawlCount) {
  1002. $dxmCopyUrl.val(failsUrls);
  1003. failsUrls = '';
  1004. $('#myModalLabel, #crawlDesc').text('采集成功');
  1005. $msgModal.find('.dxmCategoryCrawlNumBox').show();
  1006. }
  1007. },
  1008.  
  1009. //列表页单个采集\分类采集都会循环调用该方法,进行当前循环的单个产品采集
  1010. newSingleCrawl : function(uid, urls, checkFinish, haveNext, crawlCategory, call){
  1011. var urlArr = [];
  1012. if(urls) urlArr = urls.split('\n');
  1013. var total = urlArr.length,
  1014. processNum = 0;
  1015.  
  1016. var forFn = function (i) {
  1017. //处理amazon链接中有其他数据
  1018. if (urlArr[i].indexOf('amazon.com') !== -1) {
  1019. urlArr[i] = urlArr[i].split(/,|,/)[0].replace('"', '');
  1020. }
  1021. //去除链接中的逗号
  1022. if(urlArr[i].indexOf(',') > -1){
  1023. urlArr[i] = urlArr[i].replace(/,/g, '');
  1024. }
  1025. if(urlArr[i].indexOf('http:') !== 0 && urlArr[i].indexOf('https') !== 0){
  1026. urlArr[i] = 'https:' + urlArr[i];
  1027. }
  1028. //采集链接处理
  1029. if (urlArr[i] && urlArr[i].indexOf('aliexpress.com/store/product/') !== -1 && urlArr[i].indexOf('.html') !== -1) {
  1030. var itemId = urlArr[i].substring(urlArr[i].indexOf('aliexpress.com/store/product/') + 29, urlArr[i].indexOf('.html'));
  1031. if (itemId && itemId.indexOf('/') !== -1) {
  1032. var itemIdArray = itemId.split('/');
  1033. urlArr[i] = urlArr[i].replace(itemIdArray[0] + '/', '');
  1034. var newItemId = itemIdArray[1];
  1035. if(urlArr[i] && newItemId && newItemId.indexOf('_') !== -1){
  1036. var idArrays = newItemId.split('_');
  1037. urlArr[i] = urlArr[i].replace(newItemId,idArrays[1]).replace('store/product', 'item');
  1038. }
  1039. }
  1040. }
  1041. //获取对应的平台采集对象函数
  1042. var crawlObj = Crawl.getCrawlObject('single', urlArr[i]);
  1043. //判断是否获取到对应的平台采集对象函数,有则调用
  1044. crawlObj && crawlObj.crawl(urlArr[i], function(data){
  1045. data.uid = uid;
  1046. data.repeatCheck = 1; //必须查重
  1047. // 截取亚马逊商品的链接 去除qid字符串
  1048. var crawlUrl = data.url;
  1049. if(crawlUrl.indexOf('www.amazon.com')> -1){
  1050. if (crawlUrl.indexOf('qid') > -1){
  1051. var begin = crawlUrl.indexOf('qid'),
  1052. qidStr = crawlUrl.substring(begin, begin + 15);
  1053. data.url = crawlUrl.replace(qidStr, '');
  1054. }
  1055. }
  1056.  
  1057. if(data.html){ //如果获取到页面数据了
  1058. //调用请求存到店小秘的数据采集页
  1059. Html.postHtml(URL_MANAGER.url.postHTML(), data, 0, function(result){
  1060. processNum++;
  1061.  
  1062. //把处理采集结果的逻辑代码抽出去
  1063. Crawl.postHtmlResultFn(result, data, crawlCategory, checkFinish, total, processNum);
  1064.  
  1065. if (i < urlArr.length - 1) { //如果还有下一个则继续调用方法采集
  1066. i++;
  1067. forFn(i);
  1068. } else { //没有则调用回调函数
  1069. typeof call === 'function' && call();
  1070. }
  1071. });
  1072. }else{
  1073. if(!crawlCategory){ //如果是插件详情页的分类链接采集
  1074. processNum++;
  1075. data.code = -1;
  1076. data.msg = '采集内容为空!';
  1077. //计算采集结果进度
  1078. Crawl.newRecordCrawlResult(data, data.url, 0, crawlCategory);
  1079. if(checkFinish && total === processNum){
  1080. if (dataArr.length) {
  1081. $('#repeatCrawlModal').find('input[name="sourceUrlRepeat"]').prop('checked', false);
  1082. dxmModal.hide('#crawlingModal');
  1083. dxmModal.show('#repeatCrawlModal');
  1084. } else {
  1085. Crawl.displayCrawlResult(checkFinish && total === processNum);
  1086. }
  1087. }
  1088. } else {
  1089. Crawl.newRecordCrawlResult({}, data.url, 0, crawlCategory);
  1090. }
  1091.  
  1092. if (i < urlArr.length - 1) {
  1093. i++;
  1094. forFn(i);
  1095. } else {
  1096. typeof call === 'function' && call();
  1097. }
  1098. }
  1099. }, false, true);
  1100. };
  1101. forFn(0);
  1102. },
  1103.  
  1104. //处理采集结果的逻辑代码
  1105. postHtmlResultFn: function (result, data, crawlCategory, checkFinish, total, processNum) {
  1106. if(+result.code === -10 || +result.code === -11) {
  1107. $.fn.message({type: 'danger', msg: result.msg});
  1108. }
  1109. var objCrawl = result.repeatCrawlProduct,
  1110. isRepeat = 0; //是否重复采集
  1111. if(objCrawl){
  1112. isRepeat = +objCrawl.repeatCrawl;
  1113. }
  1114.  
  1115. //价格为null 显示空串
  1116. if(objCrawl && objCrawl.price == null){
  1117. result.repeatCrawlProduct.price = '';
  1118. }
  1119. if(!crawlCategory){ // 如果是插件详情页的分类链接采集则进入if
  1120. if(isRepeat === 1){//重复,生成td,并记录重复的data
  1121. dataArr.push(data);
  1122.  
  1123. var html = '<tr><td style="width:50px;">' +
  1124. '<input name="sourceUrlRepeat" type="checkbox" value="' + data.url + '"/></td>' +
  1125. '<td style="width:80px;"><div class="imgDivOut"><div class="imgDivIn">' +
  1126. '<img src="' + result.repeatCrawlProduct.imgUrl.split('|')[0]+ '"' +
  1127. ' class="imgCss" width="71px" height="71px"/></div></div></td>' +
  1128. '<td style="text-align:left;"><a href="' + data.url + '"' +
  1129. ' target="_blank">' + result.repeatCrawlProduct.name + '</a></td>'; html += '<td style="width:80px;">'+result.repeatCrawlProduct.price+'</td></tr>';
  1130.  
  1131. contentObj.append(html);
  1132. $('#repeatCrawlModal').find('input[name="sourceUrlRepeat"]').prop('checked', false);
  1133. if (checkFinish && total === processNum && !SmtCategoryCrawl.stop) {//表示采集完成,显示重复采集记录模态层
  1134. dxmModal.hide('#crawlingModal');
  1135. dxmModal.show('#repeatCrawlModal');
  1136. }
  1137. }
  1138. //记录采集结果 (最后一条+没有下一页才显示)
  1139. Crawl.newRecordCrawlResult(result, data.url, isRepeat, crawlCategory);
  1140. if(checkFinish && total === processNum && !SmtCategoryCrawl.stop){
  1141. if (dataArr.length) {
  1142. dxmModal.hide('#crawlingModal');
  1143. dxmModal.show('#repeatCrawlModal');
  1144. } else {
  1145. Crawl.displayCrawlResult(checkFinish && total === processNum);
  1146. }
  1147. }
  1148. }else{
  1149. //判断是否重复
  1150. if(isRepeat === 1){
  1151. dataArr.push(data);
  1152. var $repeatCrawlModal = $('#repeatCrawlModal'),
  1153. html = '<tr class="content"><td class="has-ipt">' +
  1154. '<input name="sourceUrlRepeat" type="checkbox" value="' + data.url + '">' +
  1155. '</td><td class="img-box"><div class="img-out">' +
  1156. '<img class="imgCss" src="' + objCrawl.imgUrl.split('|')[0] + '"' +
  1157. ' width="50px" height="50px"/></div></td>' +
  1158. '<td><a href="' + data.url + '" target="_blank">' + objCrawl.name + '</a></td>' +
  1159. '<td class="num dxm-f-right">' + objCrawl.price + '</td></tr>';
  1160.  
  1161. $repeatCrawlModal.find('#repeatValue').append(html);
  1162. $repeatCrawlModal.find('input[name="sourceUrlRepeat"]').prop('checked', false);
  1163. }
  1164. Crawl.newRecordCrawlResult(result, data.url, isRepeat, crawlCategory);
  1165. }
  1166. }
  1167. };

QingJ © 2025

镜像随时可能失效,请加Q群300939539或关注我们的公众号极客氢云获取最新地址