From 2580f4594e57b1e3eff7488537a7418f1e310198 Mon Sep 17 00:00:00 2001 From: Wesley Kerfoot Date: Sun, 2 Jun 2019 16:26:13 -0400 Subject: [PATCH] Bump version and refactor to use bloom filters for avoiding dups --- deletefb/tools/common.py | 14 ++++++++++++++ deletefb/tools/config.py | 3 ++- deletefb/tools/likes.py | 14 ++++++++++---- deletefb/tools/wall.py | 3 ++- setup.py | 5 +++-- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/deletefb/tools/common.py b/deletefb/tools/common.py index cc9e95f..c107abf 100644 --- a/deletefb/tools/common.py +++ b/deletefb/tools/common.py @@ -6,6 +6,9 @@ import time from .config import settings +# Used to avoid duplicates in the log +from pybloom_live import BloomFilter + from os.path import abspath, relpath, split, isfile from selenium.common.exceptions import ( NoSuchElementException, @@ -59,10 +62,19 @@ def archiver(category): log_file = open(log_path, mode="ta", buffering=1) + bfilter = BloomFilter( + capacity=settings["MAX_POSTS"], + error_rate=0.001 + ) + def log(content, timestamp=False): if not settings["ARCHIVE"]: return + if content in bfilter: + # This was already archived + return + structured_content = { "category" : category, "content" : content, @@ -71,6 +83,8 @@ def archiver(category): log_file.write("{0}\n".format(json.dumps(structured_content))) + bfilter.add(content) + return (log_file, log) diff --git a/deletefb/tools/config.py b/deletefb/tools/config.py index 8ca2556..078efca 100644 --- a/deletefb/tools/config.py +++ b/deletefb/tools/config.py @@ -1,3 +1,4 @@ settings = { - "ARCHIVE" : True + "ARCHIVE" : True, + "MAX_POSTS" : 5000 } diff --git a/deletefb/tools/likes.py b/deletefb/tools/likes.py index a245961..3adb151 100644 --- a/deletefb/tools/likes.py +++ b/deletefb/tools/likes.py @@ -47,19 +47,23 @@ def get_page_links(driver): return [page.get_attribute("href").replace("www", "mobile") for page in pages] -def unlike_page(driver, url): +def unlike_page(driver, url, archive=None): """ Unlikes a page given the URL to it Args: driver: seleniumrequests.Chrome Driver instance url: url string pointing to a page + archive: archiver instance Returns: None """ + driver.get(url) - print(url) + + print("Unliking {0}".format(url)) + wait = WebDriverWait(driver, 30) actions = ActionChains(driver) @@ -82,6 +86,8 @@ def unlike_page(driver, url): click_button(driver, unlike_button) + if archive: + archive(url) def unlike_pages(driver, profile_url): """ @@ -102,9 +108,9 @@ def unlike_pages(driver, profile_url): while urls: for url in urls: - unlike_page(driver, url) - load_likes(driver, profile_url) + unlike_page(driver, url, archive=archive_likes) try: + load_likes(driver, profile_url) urls = get_page_links(driver) except SELENIUM_EXCEPTIONS: # We're done diff --git a/deletefb/tools/wall.py b/deletefb/tools/wall.py index f164305..5dcd6ec 100644 --- a/deletefb/tools/wall.py +++ b/deletefb/tools/wall.py @@ -1,10 +1,11 @@ import time from selenium.webdriver.common.action_chains import ActionChains +from .config import settings from .common import SELENIUM_EXCEPTIONS, archiver, click_button # Used as a threshold to avoid running forever -MAX_POSTS = 15000 +MAX_POSTS = settings["MAX_POSTS"] def delete_posts(driver, user_profile_url, diff --git a/setup.py b/setup.py index 7e3f445..ce75ca6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ with open("README.md", "r") as fh: setuptools.setup( name="delete-facebook-posts", - version="1.1.1", + version="1.1.2", author="Wesley Kerfoot", author_email="wes@wesk.tech", description="A Selenium Script to Delete Facebook Posts", @@ -16,7 +16,8 @@ setuptools.setup( install_requires = [ "selenium", "selenium-requests", - "requests" + "requests", + "pybloom-live" ], classifiers= [ "Programming Language :: Python :: 3",