Skip to content

Instantly share code, notes, and snippets.

@daoducminh
Last active July 26, 2023 10:10
Show Gist options
  • Select an option

  • Save daoducminh/acd318f2b307703d6f7657d890b4268b to your computer and use it in GitHub Desktop.

Select an option

Save daoducminh/acd318f2b307703d6f7657d890b4268b to your computer and use it in GitHub Desktop.
import json
import os
from datetime import datetime
import pymongo
from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.http import Response
# Size of the 7-digit phone-body keyspace: bodies run 0000000..9999999.
# Must be an int: 1e7 is a float, and a float MAX_OFFSET propagates into
# range(start_offset, end_offset + 1) and f"{i:07d}" once end_offset is
# clamped to MAX_OFFSET, raising TypeError at the end of the keyspace.
LIMIT = 10_000_000
# Highest valid offset (inclusive).
MAX_OFFSET = LIMIT - 1
# Endpoint template; {head} is the two-digit prefix after "84", {body} the
# zero-padded 7-digit offset.
BASE_URL = "https://fbplus.net/lib/scan123a@321/api@234/convert__.php?email=ducthohp95@gmail.com&phone=84{head}{body}"
class MongoPipeline:
    """Item pipeline that mirrors scraped phone/uid pairs into MongoDB.

    The spider owns the Mongo client and collection; this pipeline only
    issues per-item writes against ``spider.data_coll``.
    """

    def open_spider(self, spider: Spider):
        """No setup required — the spider opens its own Mongo client."""
        pass

    def close_spider(self, spider: Spider):
        """No teardown required — the spider closes its own Mongo client."""
        pass

    def process_item(self, item, spider: Spider):
        """Upsert the item's phone/uid pair keyed on the phone number, or
        delete the phone's record when no uid was resolved. Items without a
        phone pass through untouched.
        """
        phone_number = item.get("phone")
        if not phone_number:
            return item
        user_id = item.get("uid")
        selector = {"phone": phone_number}
        if user_id:
            # Upsert using the phone number as the key.
            document = {"phone": phone_number, "uid": user_id}
            spider.data_coll.replace_one(selector, document, upsert=True)
        else:
            # No uid came back: drop any record for this phone.
            spider.data_coll.delete_one(selector)
        return item
class MySpider(Spider):
    """Scan 7-digit phone-number bodies under the prefix ``84{phone_head}``
    against fbplus.net's phone-to-UID endpoint.

    Work is chunked: each run covers ``limit`` offsets and records the chunk
    in the ``meta`` collection, so the next run resumes after the previous
    ``end_offset``. Results are written to MongoDB by ``MongoPipeline``.
    """

    name = "phone"

    # Browser-like headers copied from a Chrome session; the endpoint is
    # called as an AJAX request (X-Requested-With, matching Referer).
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "en-US,en;q=0.9,vi;q=0.8",
        "Connection": "keep-alive",
        "Referer": "https://fbplus.net/scan-phone-to-uid/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "sec-ch-ua": '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
    }

    custom_settings = {
        "LOG_ENABLED": False,
        "ITEM_PIPELINES": {
            # NOTE(review): assumes this file is importable as module
            # ``phone_crawler`` — confirm against the project layout.
            "phone_crawler.MongoPipeline": 300,
        },
        "CONCURRENT_REQUESTS": 32,
        "CONCURRENT_ITEMS": 2,
    }

    def __init__(self, cookie=None, phone_head="32", limit="5500", *args, **kwargs):
        """Connect to MongoDB and store the run parameters.

        :param cookie: PHPSESSID cookie value for fbplus.net (presumably a
            logged-in session; an empty response body is treated as an
            expired cookie — see ``parse``).
        :param phone_head: two-digit prefix after "84"; also names the
            MongoDB collection the results are written to.
        :param limit: number of offsets to scan this run (Scrapy passes CLI
            ``-a`` arguments as strings, hence the ``int()`` coercion).
        """
        super(MySpider, self).__init__(*args, **kwargs)
        self.cookie = cookie
        # Mongo client lives for the whole crawl; closed in ``closed()``.
        self.mongo_uri = os.getenv("MONGODB_URI")
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client["phone"]
        # One collection per phone_head holding {phone, uid} documents.
        self.data_coll = self.db[phone_head]
        # ``meta`` holds one checkpoint document per run.
        self.meta_coll = self.db["meta"]
        self.limit = int(limit)
        self.phone_head = phone_head

    def closed(self, reason):
        """Scrapy shutdown hook: release the MongoDB client."""
        # Delete all checked phone but not have uid — kept disabled; the
        # uid-missing placeholders are what start_requests re-queries to
        # retry on the next run:
        # self.data_coll.delete_many({"uid": {"$exists": False}})
        # Close mongo client
        self.client.close()

    def start_requests(self):
        """Resume from the last checkpoint and yield one request per offset.

        Two passes of requests are yielded:
        1. every document still missing ``uid`` (retries from earlier runs —
           and, because the placeholders were just inserted, the current
           chunk as well);
        2. the current chunk ``start_offset..end_offset``.
        NOTE(review): because of (1), pass (2) re-yields the same URLs;
        presumably Scrapy's default duplicate filter drops them — confirm.
        """
        # Get last checkpoint from meta collection with max timestamp
        last_checkpoint = self.meta_coll.find_one(
            {"phone_head": self.phone_head}, sort=[("started_at", -1)]
        )
        start_offset = 0
        if last_checkpoint:
            # Continue right after the previous run's range.
            start_offset = last_checkpoint["end_offset"] + 1
        if start_offset >= MAX_OFFSET:
            # The whole 7-digit keyspace has been scanned.
            raise CloseSpider("Done")
        end_offset = start_offset + self.limit
        # Clamp the chunk so it never runs past the end of the keyspace.
        if end_offset >= MAX_OFFSET:
            end_offset = MAX_OFFSET
        # Insert placeholder documents (zero-padded 7-digit bodies); the
        # pipeline later fills in ``uid`` or deletes the placeholder.
        self.data_coll.insert_many(
            [{"phone": f"{i:07d}"} for i in range(start_offset, end_offset + 1)]
        )
        # Checkpoint this chunk. NOTE(review): ``started_at`` is a naive
        # local datetime — confirm whether consumers expect UTC.
        self.meta_coll.insert_one(
            {
                "phone_head": self.phone_head,
                "start_offset": start_offset,
                "end_offset": end_offset,
                "started_at": datetime.now(),
            }
        )
        # Get not-yet-checked phones (no uid), including the rows inserted above.
        a = self.data_coll.find({"uid": {"$exists": False}})
        for p in a:
            phone_body = p["phone"]
            yield Request(
                BASE_URL.format(head=self.phone_head, body=phone_body),
                headers=self.headers,
                cookies={"PHPSESSID": self.cookie},
                cb_kwargs={"phone_body": phone_body},
            )
        # Start crawling the current chunk (overlaps the query above; see
        # NOTE in the docstring).
        for i in range(start_offset, end_offset + 1):
            phone_body = f"{i:07d}"
            yield Request(
                BASE_URL.format(head=self.phone_head, body=phone_body),
                headers=self.headers,
                cookies={"PHPSESSID": self.cookie},
                cb_kwargs={"phone_body": phone_body},
            )

    def parse(self, response: Response, phone_body: str):
        """Emit a {phone, uid} item for the pipeline.

        An empty response body is treated as an expired session cookie and
        aborts the crawl. NOTE(review): ``json.loads`` will raise on a
        non-empty, non-JSON body (e.g. an HTML error page) — confirm the
        endpoint only ever returns JSON or an empty body.
        """
        a = response.body
        if a:
            b = json.loads(a)
            # ``uid`` may be absent/None; the pipeline then deletes the row.
            yield {
                "phone": phone_body,
                "uid": b.get("uid"),
            }
        else:
            raise CloseSpider("cookie is expired")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment