Merge pull request #72 from weskerfoot/delete-convos

Archiving and deleting messenger conversations
6 years ago · 7359747ae4
12 changed files with 317 additions and 29 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,13 @@
+# How to contribute
+
+## Dependencies
+If you are adding any new dependencies, please make sure that both `requirements.txt` and `setup.py` have been updated. If you are unsure about the difference between `requirements.txt` and the `install_requires` section of `setup.py`, please read [setup.py vs requirements.txt](https://caremad.io/posts/2013/07/setup-vs-requirement/).
+
+## Virtualenv
+Always develop inside a virtualenv, and also test with `pip install --user .`. This helps ensure that implicit dependencies aren't accidentally introduced, and makes it more likely that the average user can run the tool without issues.
+
+## Pull requests
+Feel free to make a pull request! Make sure to give a brief overview of what you did, and why you think it is useful. If you are fixing a specific bug or resolving an issue, then make sure to reference it in your PR.
+
+## Coding style
+Try to be consistent with the existing codebase as much as possible. Things should be modularized. Don't repeat yourself if possible, but don't add needless complexity. Straightforward is often better than clever and optimized.
--- a/README.md
+++ b/README.md
@ -19,7 +19,7 @@ Personally, I did this so I would feel less attached to my Facebook profile

 ## Installation
 You have several options to run it.
-1) Install from PyPI with `pip3 install --user delete-facebook-posts`
+1) Install from PyPI with `pip3 install --user delete-facebook-posts` (recommended)
 2) Clone this repo and run `pip3 install --user .` or do `pip3 install --user
 git+https://github.com/weskerfoot/DeleteFB.git`
 3) Set up a Python virtualenv, activate it, and run `pip3 install -r requirements.txt`, then you can just run `python -m deletefb.deletefb` in the DeleteFB directory.
@ -62,14 +62,7 @@ git+https://github.com/weskerfoot/DeleteFB.git`
 * You may also pass in a code by using the `-F` argument, e.g. `-F 111111`.

 ## Delete By Year
-* The tool supports passing the `--year` flag in order to delete wall posts by
-  year. E.g. `-Y 2010` would delete posts from the year 2010. It is incompatible with any mode other than `wall`.
-
-## Unlike Pages
-* You may use `-M unlike_pages` to unlike all of your pages. The names of the
-  pages will be archived (unless archival is turned off), and this option
-  conflicts with the year option. This will only unlike your *pages* that you
-  have liked. It will *not* unlike anything else (like books or movies).
+* The tool supports passing the `--year` flag in order to delete/archive by year. E.g. `-Y 2010` would only affect items from 2010. It is only supported in the `wall` and `conversations` modes.

 ## Archival
 * The tool will archive everything being deleted by default in `.log` files.
--- a/deletefb/deletefb.log
+++ b/deletefb/deletefb.log
--- a/deletefb/deletefb.py
+++ b/deletefb/deletefb.py
@ -4,6 +4,8 @@ from .tools.config import settings
 from .tools.likes import unlike_pages
 from .tools.login import login
 from .tools.wall import delete_posts
+from .tools.conversations import traverse_conversations
+from .tools.comments import delete_comments

 import argparse
 import getpass
@ -21,7 +23,7 @@ def run_delete():
        default="wall",
        dest="mode",
        type=str,
-        choices=["wall", "unlike_pages"],
+        choices=["wall", "unlike_pages", "comments", "conversations"],
        help="The mode you want to run in. Default is `wall' which deletes wall posts"
    )

@ -91,8 +93,8 @@ def run_delete():

    settings["ARCHIVE"] = not args.archive_off

-    if args.year and args.mode != "wall":
-        parser.error("The --year option is only supported in wall mode")
+    if args.year and args.mode not in ("wall", "conversations"):
+        parser.error("The --year option is not supported in this mode")

    args_user_password = args.password or getpass.getpass('Enter your password: ')

@ -112,6 +114,13 @@ def run_delete():

    elif args.mode == "unlike_pages":
        unlike_pages(driver, args.profile_url)
+
+    elif args.mode == "comments":
+        delete_comments(driver, args.profile_url)
+
+    elif args.mode == "conversations":
+        traverse_conversations(driver, year=args.year)
+
    else:
        print("Please enter a valid mode")
        sys.exit(1)
--- a/deletefb/tools/archive.py
+++ b/deletefb/tools/archive.py
@ -1,13 +1,23 @@
 from .config import settings
 from contextlib import contextmanager
 from pathlib import Path
+from datetime import datetime
+from time import time

 import attr
+import cattr
 import json
+import typing
+
+TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

 # Used to avoid duplicates in the log
 from pybloom_live import BloomFilter

+# Tell cattr how to unstructure datetime values: they are rendered as
+# strings using TIME_FORMAT, so that the result of cattr.unstructure()
+# can be handed to json.dumps() when archiving
+cattr.register_unstructure_hook(
+    datetime, lambda dt: datetime.strftime(dt, format=TIME_FORMAT)
+)
+
 def make_filter():
    return BloomFilter(
        capacity=settings["MAX_POSTS"],
@ -27,10 +37,15 @@ class Archive:
        """
        Archive an object
        """
-        print("Archiving {0}".format(content))
+
+        if hasattr(content, 'name'):
+            print("Archiving {0}".format(content.name))

        if content.name not in self._bloom_filter:
-            self.archive_file.write(json.dumps(attr.asdict(content)) + "\n")
+            self.archive_file.write(json.dumps(cattr.unstructure(content),
+                                               indent=4,
+                                               sort_keys=True) + "\n")
+
            self._bloom_filter.add(content.name)
        return

@ -38,7 +53,7 @@ class Archive:
 def archiver(archive_type):

    archive_file = open(
-        str((Path(".") / Path(archive_type).name).with_suffix(".log")),
+        str((Path(".") / Path(archive_type).name).with_suffix(".log.{0}".format(time()))),
        mode="ta",
        buffering=1
    )
--- a/deletefb/tools/comments.py
+++ b/deletefb/tools/comments.py
@ -0,0 +1,17 @@
+from .archive import archiver
+from ..types import Comment
+from .common import SELENIUM_EXCEPTIONS, logger, click_button
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+LOG = logger(__name__)
+
+def delete_comments(driver, profile_url):
+    """
+    Open the comments section of the activity log for `profile_url`
+
+    NOTE: this is currently a stub. It only loads the page and creates a
+    20 second waiter; nothing is deleted yet.
+    """
+
+    activity_log_url = "{0}/allactivity?privacy_source=activity_log&category_key=commentscluster".format(profile_url)
+    driver.get(activity_log_url)
+
+    wait = WebDriverWait(driver, 20)
--- a/deletefb/tools/common.py
+++ b/deletefb/tools/common.py
@ -1,14 +1,19 @@
 from os.path import isfile
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
 from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
-    TimeoutException
+    TimeoutException,
+    JavascriptException
 )

 import json
 import logging
 import logging.config
 import os
+import pendulum

 SELENIUM_EXCEPTIONS = (
    NoSuchElementException,
@ -19,13 +24,18 @@ SELENIUM_EXCEPTIONS = (
 def click_button(driver, el):
    """
    Click a button using Javascript
-    Args:
-        driver: seleniumrequests.Chrome Driver instance
-    Returns:
-        None
    """
    driver.execute_script("arguments[0].click();", el)

+def scroll_to(driver, el):
+    """
+    Bring `el` into view by calling scrollIntoView() on it via Javascript
+
+    Any of SELENIUM_EXCEPTIONS raised while doing so is ignored
+    """
+    script = "arguments[0].scrollIntoView();"
+
+    try:
+        driver.execute_script(script, el)
+    except SELENIUM_EXCEPTIONS:
+        pass
+
 def logger(name):
    """
    Args:
@ -45,6 +55,17 @@ def logger(name):
        logging.config.dictConfig(config["logging"])
    return logging.getLogger(name)

+
+def wait_xpath(driver, expr, timeout=20):
+    """
+    Takes an XPath expression, and waits at most `timeout` seconds
+    (20 by default) until a matching element exists
+
+    Args:
+        driver: the driver to wait on
+        expr: XPath expression identifying the element
+        timeout: maximum number of seconds to wait
+
+    Returns:
+        The located element, or None if waiting raised one of
+        SELENIUM_EXCEPTIONS
+    """
+    try:
+        # until() hands back whatever the expected condition returned,
+        # which for presence_of_element_located is the element itself
+        return WebDriverWait(driver, timeout).until(
+            EC.presence_of_element_located((By.XPATH, expr))
+        )
+    except SELENIUM_EXCEPTIONS:
+        return None
+
 NO_CHROME_DRIVER = """
 You need to install the chromedriver for Selenium\n
 Please see this link https://github.com/weskerfoot/DeleteFB#how-to-use-it\n
--- a/deletefb/tools/conversations.py
+++ b/deletefb/tools/conversations.py
@ -0,0 +1,179 @@
+from .archive import archiver
+from ..types import Conversation, Message
+from .common import SELENIUM_EXCEPTIONS, logger, click_button, wait_xpath
+from .config import settings
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support.ui import Select
+from pendulum import now
+from json import loads
+
+import lxml.html as lxh
+
+LOG = logger(__name__)
+
+def get_conversations(driver):
+    """
+    Collect every conversation listed in the messenger thread list
+
+    Follows the "see older threads" link page by page and returns a list
+    of Conversation instances holding the url, date and name of each thread
+    """
+
+    wait_xpath(driver, "//div[@id=\"threadlist_rows\"]")
+
+    # Results are accumulated in a list on purpose. This function *cannot*
+    # be a generator, otherwise elements will become stale
+    found = []
+
+    while True:
+        for link in driver.find_elements_by_xpath("//a"):
+            url = link.get_attribute("href")
+
+            # Only links that open a thread are of interest
+            if not url or "messages/read" not in url:
+                continue
+
+            date = link.find_element_by_xpath("../../..//abbr").text
+            header = link.find_element_by_xpath("../../../div/div/header/h3")
+            conversation_name = header.text.strip()
+
+            assert(conversation_name)
+            assert(url)
+
+            found.append(
+                Conversation(url=url, date=date, name=conversation_name)
+            )
+
+        # Move on to the next page of threads, if there is one
+        try:
+            older_threads = driver.find_element_by_id("see_older_threads")
+            next_url = older_threads.find_element_by_xpath("a").get_attribute("href")
+        except SELENIUM_EXCEPTIONS:
+            break
+
+        if not next_url:
+            break
+
+        driver.get(next_url)
+
+    return found
+
+def parse_conversation(driver):
+    """
+    Yield a Message for every message node on the currently loaded page
+
+    The author and timestamp are read from the JSON stored in each node's
+    `data-store` attribute; the content is the node's text
+    """
+
+    page = lxh.fromstring(driver.page_source)
+
+    for node in page.xpath("//div[@class='msg']/div"):
+        meta = loads(node.get("data-store"))
+
+        yield Message(
+            name=meta.get("author"),
+            content=node.text_content(),
+            date=meta.get("timestamp")
+        )
+
+def get_images(driver):
+    """
+    Gets all links to images in a messenger conversation
+
+    Yields each distinct `src` value once, in document order. <img> nodes
+    with a missing or empty `src` are skipped
+    """
+    seen = set()
+
+    for img in lxh.fromstring(driver.page_source).xpath("//img"):
+        src = img.get("src")
+
+        # De-duplicate on the link itself: separate <img> nodes are distinct
+        # objects even when they point at the same image
+        if not src or src in seen:
+            continue
+
+        seen.add(src)
+        yield src
+
+def get_convo(driver, convo):
+    """
+    Load a conversation and collect its messages and image links
+
+    Keeps pressing "See Older Messages" until that element can no longer
+    be found, then returns a (messages, image_links) tuple of lists
+    """
+    older_xpath = "//*[contains(text(), 'See Older Messages')]"
+
+    driver.get(convo.url)
+    wait_xpath(driver, older_xpath)
+
+    # Expand conversation until we've reached the beginning
+    while True:
+        try:
+            older_button = driver.find_element_by_xpath(older_xpath)
+        except SELENIUM_EXCEPTIONS:
+            break
+
+        if not older_button:
+            break
+
+        try:
+            click_button(driver, older_button)
+        except SELENIUM_EXCEPTIONS:
+            # The click failed; look the element up again and retry
+            continue
+
+    messages = list(parse_conversation(driver))
+    unique_links = set(get_images(driver))
+    return (messages, list(unique_links))
+
+def delete_conversation(driver, convo):
+    """
+    Delete the conversation shown on the currently loaded page
+
+    Picks the "Delete" entry of the page's <select> menu, waits for the
+    confirmation heading and then clicks the "Delete" button
+    """
+
+    menu = Select(driver.find_element_by_xpath("//select/option[contains(text(), 'Delete')]/.."))
+
+    # Index of the first option labelled exactly "Delete", if any
+    delete_index = next(
+        (i for i, option in enumerate(menu.options)
+         if option.text.strip() == "Delete"),
+        None
+    )
+
+    if delete_index is not None:
+        menu.select_by_index(delete_index)
+
+    wait_xpath(driver, "//h2[contains(text(), 'Delete conversation')]")
+    confirm_button = driver.find_element_by_xpath("//a[contains(text(), 'Delete')][@role='button']")
+    ActionChains(driver).move_to_element(confirm_button).click().perform()
+
+def extract_convo(driver, convo):
+    """
+    Fill in `convo` with the messages and image links found on its page
+
+    Returns the same Conversation instance, updated in place, or None
+    when get_convo gives back nothing
+    """
+    extracted = get_convo(driver, convo)
+
+    if extracted:
+        convo.messages, convo.image_links = extracted
+        return convo
+
+    return None
+
+def traverse_conversations(driver, year=None):
+    """
+    Archive (unless archival is turned off) and delete conversations
+
+    Args:
+        driver: the driver used to load the mobile messenger pages
+        year: optional year. When set, only conversations whose date was
+              parsed and falls in that year are processed; conversations
+              without a parsed date are left alone. When not set, every
+              conversation is processed.
+    """
+
+    driver.get("https://mobile.facebook.com/messages/?pageNum=1&selectable&see_older_newer=1")
+
+    convos = get_conversations(driver)
+
+    # Parse the year once, before anything is deleted, rather than once
+    # per conversation
+    target_year = int(year) if year else None
+
+    # NOTE(review): Archive.archive skips content whose name is already in
+    # its bloom filter, so a second conversation with the same name would
+    # be deleted without being archived -- confirm names are unique
+    with archiver("conversations") as archive_convo:
+        for convo in convos:
+            if target_year is not None:
+                # Only look at convos that have a date in the wanted year
+                if not convo.date or convo.date.year != target_year:
+                    continue
+
+            extract_convo(driver, convo)
+
+            if settings["ARCHIVE"]:
+                archive_convo.archive(convo)
+
+            delete_conversation(driver, convo)
+
--- a/deletefb/tools/wall.py
+++ b/deletefb/tools/wall.py
@ -42,7 +42,6 @@ def delete_posts(driver,
                    post_content_element = driver.find_element_by_class_name(post_content_sel)
                    post_content_ts = driver.find_element_by_class_name(post_timestamp_sel)

-
                    # Archive the post
                    archive_wall_post.archive(
                        Post(
--- a/deletefb/types.py
+++ b/deletefb/types.py
@ -1,29 +1,55 @@
 import attr
 import uuid
-import datetime
+import pendulum

-def timestamp_now():
+from datetime import datetime
+
+def convert_date(text):
    """
-    Returns: a timestamp for this instant, in ISO 8601 format
+    Tries to parse a date into a DateTime instance
+    Returns `None` if it cannot be parsed
    """
-    return datetime.datetime.isoformat(datetime.datetime.now())
+    try:
+        return pendulum.from_format(text, "DD/M/YYYY")
+    except ValueError:
+        try:
+            return (pendulum.from_format(text, "DD MMM")
+                    .set(year=pendulum.now().year))
+        except ValueError:
+            return None

 # Data type definitions of posts and comments
@attr.s
 class Post:
    content = attr.ib()
    comments = attr.ib(default=[])
-    date = attr.ib(factory=timestamp_now)
+    date = attr.ib(factory=pendulum.now)
    name = attr.ib(factory=lambda: uuid.uuid4().hex)

@attr.s
 class Comment:
    commenter = attr.ib()
    content = attr.ib()
-    date = attr.ib(factory=timestamp_now)
+    date = attr.ib(factory=pendulum.now)
    name = attr.ib(factory=lambda: uuid.uuid4().hex)

+@attr.s
+class Conversation:
+    # Link to the conversation and its display name
+    url = attr.ib()
+    name = attr.ib()
+    # Converted by convert_date, which gives None for text it cannot parse
+    date : datetime = attr.ib(converter=convert_date)
+    # Built by a factory so that every instance gets its own list instead
+    # of all instances sharing one default list object
+    messages = attr.ib(factory=list)
+    image_links = attr.ib(factory=list)
+
+@attr.s
+class Message:
+    # A single message of a conversation: who it is attributed to (`name`)
+    # and its text (`content`)
+    name = attr.ib()
+    content = attr.ib()
+
+    # Remove the last 3 digits from FB's dates. They are not standard.
+    # NOTE(review): presumably the raw value is in milliseconds, so dropping
+    # three digits leaves seconds for from_timestamp -- confirm
+    date : datetime = attr.ib(converter=lambda t: pendulum.from_timestamp(int(str(t)[0:-3])))
+
@attr.s
 class Page:
    name = attr.ib()
-    date = attr.ib(factory=timestamp_now)
+    date = attr.ib(factory=pendulum.now)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,13 +1,26 @@
 attrs==19.1.0
 bitarray==0.9.3
+bleach==3.1.0
 certifi==2018.11.29
 chardet==3.0.4
+docutils==0.14
 idna==2.8
+lxml==4.4.0
+pendulum==2.0.5
+pkginfo==1.5.0.1
 pybloom-live==3.0.0
+Pygments==2.4.2
+python-dateutil==2.8.0
+pytzdata==2019.2
+readme-renderer==24.0
 requests==2.22.0
 requests-file==1.4.3
+requests-toolbelt==0.9.1
 selenium==3.141.0
 selenium-requests==1.3
 six==1.12.0
 tldextract==2.2.0
+tqdm==4.32.2
+twine==1.13.0
 urllib3==1.25.2
+webencodings==0.5.1
--- a/setup.py
+++ b/setup.py
@ -24,7 +24,10 @@ setuptools.setup(
        "selenium-requests",
        "requests",
        "pybloom-live",
-        "attrs"
+        "attrs",
+        "cattrs",
+        "lxml",
+        "pendulum"
    ],
    classifiers= [
        "Programming Language :: Python :: 3",