Browse Source

Bump version and refactor to use Bloom filters to avoid archiving duplicates

pull/51/head
Wesley Kerfoot 5 years ago
parent
commit
2580f4594e
  1. 14
      deletefb/tools/common.py
  2. 3
      deletefb/tools/config.py
  3. 14
      deletefb/tools/likes.py
  4. 3
      deletefb/tools/wall.py
  5. 5
      setup.py

14
deletefb/tools/common.py

@ -6,6 +6,9 @@ import time
from .config import settings
# Used to avoid duplicates in the log
from pybloom_live import BloomFilter
from os.path import abspath, relpath, split, isfile
from selenium.common.exceptions import (
NoSuchElementException,
@ -59,10 +62,19 @@ def archiver(category):
log_file = open(log_path, mode="ta", buffering=1)
bfilter = BloomFilter(
capacity=settings["MAX_POSTS"],
error_rate=0.001
)
def log(content, timestamp=False):
if not settings["ARCHIVE"]:
return
if content in bfilter:
# This was already archived
return
structured_content = {
"category" : category,
"content" : content,
@ -71,6 +83,8 @@ def archiver(category):
log_file.write("{0}\n".format(json.dumps(structured_content)))
bfilter.add(content)
return (log_file, log)

3
deletefb/tools/config.py

@ -1,3 +1,4 @@
settings = {
"ARCHIVE" : True
"ARCHIVE" : True,
"MAX_POSTS" : 5000
}

14
deletefb/tools/likes.py

@ -47,19 +47,23 @@ def get_page_links(driver):
return [page.get_attribute("href").replace("www", "mobile") for page in pages]
def unlike_page(driver, url):
def unlike_page(driver, url, archive=None):
"""
Unlikes a page given the URL to it
Args:
driver: seleniumrequests.Chrome Driver instance
url: url string pointing to a page
archive: archiver instance
Returns:
None
"""
driver.get(url)
print(url)
print("Unliking {0}".format(url))
wait = WebDriverWait(driver, 30)
actions = ActionChains(driver)
@ -82,6 +86,8 @@ def unlike_page(driver, url):
click_button(driver, unlike_button)
if archive:
archive(url)
def unlike_pages(driver, profile_url):
"""
@ -102,9 +108,9 @@ def unlike_pages(driver, profile_url):
while urls:
for url in urls:
unlike_page(driver, url)
load_likes(driver, profile_url)
unlike_page(driver, url, archive=archive_likes)
try:
load_likes(driver, profile_url)
urls = get_page_links(driver)
except SELENIUM_EXCEPTIONS:
# We're done

3
deletefb/tools/wall.py

@ -1,10 +1,11 @@
import time
from selenium.webdriver.common.action_chains import ActionChains
from .config import settings
from .common import SELENIUM_EXCEPTIONS, archiver, click_button
# Used as a threshold to avoid running forever
MAX_POSTS = 15000
MAX_POSTS = settings["MAX_POSTS"]
def delete_posts(driver,
user_profile_url,

5
setup.py

@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
setuptools.setup(
name="delete-facebook-posts",
version="1.1.1",
version="1.1.2",
author="Wesley Kerfoot",
author_email="wes@wesk.tech",
description="A Selenium Script to Delete Facebook Posts",
@ -16,7 +16,8 @@ setuptools.setup(
install_requires = [
"selenium",
"selenium-requests",
"requests"
"requests",
"pybloom-live"
],
classifiers= [
"Programming Language :: Python :: 3",

Loading…
Cancel
Save