Last active
July 26, 2023 10:10
-
-
Save daoducminh/acd318f2b307703d6f7657d890b4268b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import os | |
| from datetime import datetime | |
| import pymongo | |
| from scrapy import Request, Spider | |
| from scrapy.exceptions import CloseSpider | |
| from scrapy.http import Response | |
# Total number of 7-digit phone bodies (0000000 .. 9999999).
# Must be an int: offsets derived from MAX_OFFSET are fed to range(), which
# raises TypeError on floats (the original `1e7` made MAX_OFFSET a float and
# crashed the final batch when end_offset was clamped to MAX_OFFSET).
LIMIT = 10_000_000
# Highest valid 7-digit phone-body offset.
MAX_OFFSET = LIMIT - 1
# Endpoint that converts a full phone number (84 + head + body) to a UID.
BASE_URL = "https://fbplus.net/lib/scan123a@321/api@234/convert__.php?email=ducthohp95@gmail.com&phone=84{head}{body}"
class MongoPipeline:
    """Scrapy item pipeline persisting phone→uid results to MongoDB.

    Relies on the spider exposing ``data_coll`` (a pymongo collection);
    the spider also owns the client's lifecycle, so open/close are no-ops.
    """

    def open_spider(self, spider: Spider):
        # Connection setup is handled by the spider itself.
        pass

    def close_spider(self, spider: Spider):
        # Nothing to release here; the spider closes its own client.
        pass

    def process_item(self, item, spider: Spider):
        """Upsert the record keyed by phone, or remove it when no uid was found."""
        phone = item.get("phone")
        if not phone:
            # No phone in the item: pass it through untouched.
            return item
        uid = item.get("uid")
        if not uid:
            # The number resolved to nothing — drop any stored placeholder.
            spider.data_coll.delete_one({"phone": phone})
            return item
        # Known uid: replace (or create) the document, keyed by phone.
        document = {"phone": phone, "uid": uid}
        spider.data_coll.replace_one({"phone": phone}, document, upsert=True)
        return item
class MySpider(Spider):
    """Scan 7-digit phone bodies under prefix ``84{phone_head}`` against the
    fbplus.net phone-to-uid endpoint, resuming from a MongoDB checkpoint.

    Yields ``{"phone", "uid"}`` items that MongoPipeline (registered in
    custom_settings) persists into a per-prefix collection.
    """

    name = "phone"
    # Browser-like XHR headers captured from a real session; presumably needed
    # to pass the site's request filtering — TODO confirm which are required.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "en-US,en;q=0.9,vi;q=0.8",
        "Connection": "keep-alive",
        "Referer": "https://fbplus.net/scan-phone-to-uid/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "sec-ch-ua": '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
    }
    custom_settings = {
        "LOG_ENABLED": False,
        # NOTE(review): assumes this file is importable as module `phone_crawler`
        # — verify against how the project is laid out.
        "ITEM_PIPELINES": {
            "phone_crawler.MongoPipeline": 300,
        },
        "CONCURRENT_REQUESTS": 32,
        "CONCURRENT_ITEMS": 2,
    }

    def __init__(self, cookie=None, phone_head="32", limit="5500", *args, **kwargs):
        """Spider arguments (strings, as passed via ``scrapy crawl -a ...``).

        cookie: PHPSESSID session cookie value for the target site.
        phone_head: carrier prefix after "84"; also names the data collection.
        limit: how many 7-digit bodies to scan in this run.
        """
        super(MySpider, self).__init__(*args, **kwargs)
        self.cookie = cookie
        self.mongo_uri = os.getenv("MONGODB_URI")
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client["phone"]
        # One collection per prefix holds {"phone": ..., "uid": ...} documents.
        self.data_coll = self.db[phone_head]
        # "meta" stores one checkpoint document per run (offsets + timestamp).
        self.meta_coll = self.db["meta"]
        self.limit = int(limit)
        self.phone_head = phone_head

    def closed(self, reason):
        """Scrapy shutdown hook: release the Mongo client."""
        # Delete all checked phones that did not resolve to a uid
        # (intentionally disabled — unresolved placeholders are kept so the
        # next run can retry them via the $exists query in start_requests):
        # self.data_coll.delete_many({"uid": {"$exists": False}})
        # Close mongo client
        self.client.close()

    def start_requests(self):
        """Resume from the latest checkpoint and request one batch of numbers."""
        # Get last checkpoint from meta collection with max started_at timestamp.
        last_checkpoint = self.meta_coll.find_one(
            {"phone_head": self.phone_head}, sort=[("started_at", -1)]
        )
        start_offset = 0
        if last_checkpoint:
            # NOTE(review): assumes end_offset was stored as a number; the next
            # batch starts right after it.
            start_offset = last_checkpoint["end_offset"] + 1
        if start_offset >= MAX_OFFSET:
            # Entire 0000000..9999999 space already scanned for this prefix.
            raise CloseSpider("Done")
        end_offset = start_offset + self.limit
        # Clamp the batch to the last valid offset.
        # NOTE(review): MAX_OFFSET must be an int here — range() below raises
        # TypeError on floats; verify how LIMIT/MAX_OFFSET are declared.
        if end_offset >= MAX_OFFSET:
            end_offset = MAX_OFFSET
        # Insert placeholder docs for this batch; uid is filled in (or the doc
        # deleted) by the pipeline as responses arrive.
        self.data_coll.insert_many(
            [{"phone": f"{i:07d}"} for i in range(start_offset, end_offset + 1)]
        )
        # Record the checkpoint before crawling so a crash does not rescan.
        # NOTE(review): datetime.now() is timezone-naive — confirm that is OK
        # for the started_at sort above.
        self.meta_coll.insert_one(
            {
                "phone_head": self.phone_head,
                "start_offset": start_offset,
                "end_offset": end_offset,
                "started_at": datetime.now(),
            }
        )
        # Request every phone not yet resolved to a uid — this includes the
        # placeholders just inserted AND leftovers from earlier runs.
        a = self.data_coll.find({"uid": {"$exists": False}})
        for p in a:
            phone_body = p["phone"]
            yield Request(
                BASE_URL.format(head=self.phone_head, body=phone_body),
                headers=self.headers,
                cookies={"PHPSESSID": self.cookie},
                cb_kwargs={"phone_body": phone_body},
            )
        # NOTE(review): this second pass re-yields the current batch already
        # covered by the $exists query above; Scrapy's default dupefilter will
        # drop the duplicate URLs, but the loop looks redundant — confirm intent.
        for i in range(start_offset, end_offset + 1):
            phone_body = f"{i:07d}"
            yield Request(
                BASE_URL.format(head=self.phone_head, body=phone_body),
                headers=self.headers,
                cookies={"PHPSESSID": self.cookie},
                cb_kwargs={"phone_body": phone_body},
            )

    def parse(self, response: Response, phone_body: str):
        """Yield a {"phone", "uid"} item from the JSON reply.

        An empty response body is treated as an expired session cookie and
        aborts the whole crawl.
        """
        a = response.body
        if a:
            b = json.loads(a)
            # uid may be absent/None — the pipeline then deletes the placeholder.
            yield {
                "phone": phone_body,
                "uid": b.get("uid"),
            }
        else:
            raise CloseSpider("cookie is expired")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment