Merge pull request #72 from weskerfoot/delete-convos

Archiving and deleting messenger conversations
6 years ago · 7359747ae4
12 changed files with 317 additions and 29 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,13 @@
 ### How to contribute
 ## Dependencies
 If you are adding any new dependencies, please make sure that both `requirements.txt` and `setup.py` have been updated. Please read [this](https://caremad.io/posts/2013/07/setup-vs-requirement/) if you are confused about the difference between `requirements.txt` and the `install_requires` section.
 ## Virtualenv
 Always develop inside a virtualenv, and also test the installed package with `pip install --user .`. This helps make sure implicit dependencies aren't accidentally introduced, and makes it more likely that the average user can run it without issues.
 ## Pull requests
 Feel free to make a pull request! Make sure to give a brief overview of what you did, and why you think it is useful. If you are fixing a specific bug or resolving an issue, then make sure to reference it in your PR.
 ## Coding style
 Try to be consistent with the existing codebase as much as possible. Things should be modularized. Don't repeat yourself if possible, but don't add needless complexity. Straightforward is often better than clever and optimized.
--- a/README.md
+++ b/README.md
@ -19,7 +19,7 @@ Personally, I did this so I would feel less attached to my Facebook profile
 ## Installation
 You have several options to run it.
-1) Install from PyPI with `pip3 install --user delete-facebook-posts`
+1) Install from PyPI with `pip3 install --user delete-facebook-posts` (recommended)
 2) Clone this repo and run `pip3 install --user .` or do `pip3 install --user
 git+https://github.com/weskerfoot/DeleteFB.git`
 3) Set up a Python virtualenv, activate it, and run `pip3 install -r requirements.txt`, then you can just run `python -m deletefb.deletefb` in the DeleteFB directory.
@ -62,14 +62,7 @@ git+https://github.com/weskerfoot/DeleteFB.git`
 * You may also pass in a code by using the `-F` argument, e.g. `-F 111111`.
 ## Delete By Year
-* The tool supports passing the `--year` flag in order to delete wall posts by
+* The tool supports passing the `--year` flag in order to delete/archive by year. E.g. `-Y 2010` would only affect posts from 2010.
  It is only supported in the `wall` and `conversations` modes.
 ## Unlike Pages
 * You may use `-M unlike_pages` to unlike all of your pages. The names of the
  pages will be archived (unless archival is turned off), and this option
  conflicts with the year option. This will only unlike your *pages* that you
  have liked. It will *not* unlike anything else (like books or movies).
 ## Archival
 * The tool will archive everything being deleted by default in `.log` files.
--- a/deletefb/deletefb.log
+++ b/deletefb/deletefb.log
--- a/deletefb/deletefb.py
+++ b/deletefb/deletefb.py
@ -4,6 +4,8 @@ from .tools.config import settings
 from .tools.likes import unlike_pages
 from .tools.login import login
 from .tools.wall import delete_posts
 from .tools.conversations import traverse_conversations
 from .tools.comments import delete_comments
 import argparse
 import getpass
@ -21,7 +23,7 @@ def run_delete():
        default="wall",
        dest="mode",
        type=str,
-        choices=["wall", "unlike_pages"],
+        choices=["wall", "unlike_pages", "comments", "conversations"],
        help="The mode you want to run in. Default is `wall' which deletes wall posts"
    )
@ -91,8 +93,8 @@ def run_delete():
    settings["ARCHIVE"] = not args.archive_off
-    if args.year and args.mode != "wall":
+    if args.year and args.mode not in ("wall", "conversations"):
-        parser.error("The --year option is only supported in wall mode")
+        parser.error("The --year option is not supported in this mode")
    args_user_password = args.password or getpass.getpass('Enter your password: ')
@ -112,6 +114,13 @@ def run_delete():
    elif args.mode == "unlike_pages":
        unlike_pages(driver, args.profile_url)
    elif args.mode == "comments":
        delete_comments(driver, args.profile_url)
    elif args.mode == "conversations":
        traverse_conversations(driver, year=args.year)
    else:
        print("Please enter a valid mode")
        sys.exit(1)
--- a/deletefb/tools/archive.py
+++ b/deletefb/tools/archive.py
@ -1,13 +1,23 @@
 from .config import settings
 from contextlib import contextmanager
 from pathlib import Path
 from datetime import datetime
 from time import time
 import attr
 import cattr
 import json
 import typing
 TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
 # Used to avoid duplicates in the log
 from pybloom_live import BloomFilter
 cattr.register_unstructure_hook(
    datetime, lambda dt: datetime.strftime(dt, format=TIME_FORMAT)
 )
 def make_filter():
    return BloomFilter(
        capacity=settings["MAX_POSTS"],
@ -27,10 +37,15 @@ class Archive:
        """
        Archive an object
        """
-        print("Archiving {0}".format(content))
+
        if hasattr(content, 'name'):
            print("Archiving {0}".format(content.name))
        if content.name not in self._bloom_filter:
-            self.archive_file.write(json.dumps(attr.asdict(content)) + "\n")
+            self.archive_file.write(json.dumps(cattr.unstructure(content),
                                               indent=4,
                                               sort_keys=True) + "\n")
            self._bloom_filter.add(content.name)
        return
@ -38,7 +53,7 @@ class Archive:
 def archiver(archive_type):
    archive_file = open(
-        str((Path(".") / Path(archive_type).name).with_suffix(".log")),
+        str((Path(".") / Path(archive_type).name).with_suffix(".log.{0}".format(time()))),
        mode="ta",
        buffering=1
    )
--- a/deletefb/tools/comments.py
+++ b/deletefb/tools/comments.py
@ -0,0 +1,17 @@
 from .archive import archiver
 from ..types import Comment
 from .common import SELENIUM_EXCEPTIONS, logger, click_button
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 LOG = logger(__name__)
 def delete_comments(driver, profile_url):
    """
    Open the comments section of the activity log for a profile

    Args:
        driver: browser driver; only its `get` method is called here
        profile_url: base URL of the profile whose activity log is opened

    Returns:
        None
    """
    activity_log_url = (
        "{0}/allactivity?privacy_source=activity_log&category_key=commentscluster"
    ).format(profile_url)
    driver.get(activity_log_url)

    # 20 second wait helper; nothing in this function uses it yet
    wait = WebDriverWait(driver, 20)
--- a/deletefb/tools/common.py
+++ b/deletefb/tools/common.py
@ -1,14 +1,19 @@
 from os.path import isfile
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
-    TimeoutException
+    TimeoutException,
    JavascriptException
 )
 import json
 import logging
 import logging.config
 import os
 import pendulum
 SELENIUM_EXCEPTIONS = (
    NoSuchElementException,
@ -19,13 +24,18 @@ SELENIUM_EXCEPTIONS = (
 def click_button(driver, el):
    """
    Trigger a click on an element by running Javascript in the browser

    Args:
        driver: seleniumrequests.Chrome Driver instance
        el: the element to click, handed to the script as arguments[0]

    Returns:
        None
    """
    click_script = "arguments[0].click();"
    driver.execute_script(click_script, el)
 def scroll_to(driver, el):
    """
    Bring an element into view by running Javascript in the browser

    Any of the SELENIUM_EXCEPTIONS raised by the script is swallowed, so
    the caller always gets None back

    Args:
        driver: seleniumrequests.Chrome Driver instance
        el: the element to scroll to, handed to the script as arguments[0]
    """
    scroll_script = "arguments[0].scrollIntoView();"
    try:
        driver.execute_script(scroll_script, el)
    except SELENIUM_EXCEPTIONS:
        pass
 def logger(name):
    """
    Args:
@ -45,6 +55,17 @@ def logger(name):
        logging.config.dictConfig(config["logging"])
    return logging.getLogger(name)
 def wait_xpath(driver, expr):
    """
    Wait, for at most 20 seconds, until an element matching an XPath
    expression is present

    Any of the SELENIUM_EXCEPTIONS raised while waiting is swallowed

    Args:
        driver: seleniumrequests.Chrome Driver instance
        expr: XPath expression identifying the element to wait for

    Returns:
        None
    """
    locator = (By.XPATH, expr)
    waiter = WebDriverWait(driver, 20)
    try:
        waiter.until(EC.presence_of_element_located(locator))
    except SELENIUM_EXCEPTIONS:
        pass
 NO_CHROME_DRIVER = """
 You need to install the chromedriver for Selenium\n
 Please see this link https://github.com/weskerfoot/DeleteFB#how-to-use-it\n
--- a/deletefb/tools/conversations.py
+++ b/deletefb/tools/conversations.py
@ -0,0 +1,179 @@
 from .archive import archiver
 from ..types import Conversation, Message
 from .common import SELENIUM_EXCEPTIONS, logger, click_button, wait_xpath
 from .config import settings
 from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.support.ui import Select
 from pendulum import now
 from json import loads
 import lxml.html as lxh
 LOG = logger(__name__)
 def get_conversations(driver):
    """
    Collect every conversation listed in the messenger thread list

    Goes through each page of the thread list, following the link inside
    the `see_older_threads` element until there is none left

    Returns:
        A list of Conversation instances with url, date and name filled in
    """
    wait_xpath(driver, "//div[@id=\"threadlist_rows\"]")

    # Results are accumulated in a list on purpose: this function *cannot*
    # be a generator, otherwise elements will become stale
    found = []

    while True:
        for link in driver.find_elements_by_xpath("//a"):
            url = link.get_attribute("href")

            # Only anchors that point at a message thread are conversations
            if not url or "messages/read" not in url:
                continue

            date = link.find_element_by_xpath("../../..//abbr").text
            conversation_name = link.find_element_by_xpath("../../../div/div/header/h3").text.strip()
            assert conversation_name
            assert url

            found.append(Conversation(url=url, date=date, name=conversation_name))

        try:
            older_threads = driver.find_element_by_id("see_older_threads")
            next_url = older_threads.find_element_by_xpath("a").get_attribute("href")
        except SELENIUM_EXCEPTIONS:
            break

        if not next_url:
            break

        driver.get(next_url)

    return found
 def parse_conversation(driver):
    """
    Yield a Message for each message on the page currently loaded

    The page source is parsed with lxml. The author and timestamp come from
    the JSON held in each element's `data-store` attribute, and the content
    is the element's text
    """
    page = lxh.fromstring(driver.page_source)
    for msg_el in page.xpath("//div[@class='msg']/div"):
        meta = loads(msg_el.get("data-store"))
        yield Message(
            name=meta.get("author"),
            content=msg_el.text_content(),
            date=meta.get("timestamp"),
        )
 def get_images(driver):
    """
    Gets all links to images in a messenger conversation

    Each distinct `src` is yielded once, in the order it appears in the
    page. <img> elements without a `src` attribute are skipped
    """
    seen = set()
    for img in lxh.fromstring(driver.page_source).xpath("//img"):
        src = img.get("src")
        # Duplicates are detected on the link itself; a set of the parsed
        # elements would not collapse two <img> tags sharing one src
        if src is None or src in seen:
            continue
        seen.add(src)
        yield src
 def get_convo(driver, convo):
    """
    Load a conversation and collect everything in it

    'See Older Messages' is clicked over and over until it can no longer be
    found, so the page holds the whole conversation before it is parsed

    Returns:
        A tuple of (list of messages, list of de-duplicated image links)
    """
    older_xpath = "//*[contains(text(), 'See Older Messages')]"

    driver.get(convo.url)
    wait_xpath(driver, older_xpath)

    # Keep expanding until the beginning of the conversation is reached
    while True:
        try:
            older_link = driver.find_element_by_xpath(older_xpath)
        except SELENIUM_EXCEPTIONS:
            break

        if not older_link:
            break

        try:
            click_button(driver, older_link)
        except SELENIUM_EXCEPTIONS:
            # The click failed; go around again and look the link up afresh
            pass

    messages = list(parse_conversation(driver))
    image_links = list(set(get_images(driver)))
    return messages, image_links
 def delete_conversation(driver, convo):
    """
    Delete the conversation on the page currently loaded in the driver

    Picks 'Delete' in the page's drop-down, waits for the
    'Delete conversation' heading, then clicks the 'Delete' button

    Args:
        driver: seleniumrequests.Chrome Driver instance
        convo: not used by this function
    """
    chain = ActionChains(driver)

    dropdown = driver.find_element_by_xpath("//select/option[contains(text(), 'Delete')]/..")
    menu = Select(dropdown)

    # Index of the first option labelled exactly "Delete", if there is one
    delete_index = next(
        (idx for idx, opt in enumerate(menu.options) if opt.text.strip() == "Delete"),
        None,
    )
    if delete_index is not None:
        menu.select_by_index(delete_index)

    wait_xpath(driver, "//h2[contains(text(), 'Delete conversation')]")

    confirm = driver.find_element_by_xpath("//a[contains(text(), 'Delete')][@role='button']")
    chain.move_to_element(confirm).click().perform()
 def extract_convo(driver, convo):
    """
    Fill in a conversation's messages and image links

    The Conversation passed in is updated in place and returned. None is
    returned instead when get_convo gives back nothing
    """
    fetched = get_convo(driver, convo)
    if fetched:
        convo.messages, convo.image_links = fetched
        return convo
    return None
 def traverse_conversations(driver, year=None):
    """
    Archive (when enabled) and delete messenger conversations

    Args:
        driver: seleniumrequests.Chrome Driver instance
        year: when set, only conversations dated in that year are touched,
              and conversations without a date are skipped. When not set,
              every conversation is processed
    """
    driver.get("https://mobile.facebook.com/messages/?pageNum=1&selectable&see_older_newer=1")
    convos = get_conversations(driver)

    with archiver("conversations") as archive_convo:
        for convo in convos:
            if year:
                # Only look at convos that have a date within this year
                if not convo.date or convo.date.year != int(year):
                    continue

            extract_convo(driver, convo)
            if settings["ARCHIVE"]:
                archive_convo.archive(convo)
            delete_conversation(driver, convo)
--- a/deletefb/tools/wall.py
+++ b/deletefb/tools/wall.py
@ -42,7 +42,6 @@ def delete_posts(driver,
                    post_content_element = driver.find_element_by_class_name(post_content_sel)
                    post_content_ts = driver.find_element_by_class_name(post_timestamp_sel)
                    # Archive the post
                    archive_wall_post.archive(
                        Post(
--- a/deletefb/types.py
+++ b/deletefb/types.py
@ -1,29 +1,55 @@
 import attr
 import uuid
-import datetime
+import pendulum
-def timestamp_now():
+from datetime import datetime
 def convert_date(text):
    """
-    Returns: a timestamp for this instant, in ISO 8601 format
+    Tries to parse a date into a DateTime instance
    Returns `None` if it cannot be parsed
    """
-    return datetime.datetime.isoformat(datetime.datetime.now())
+    try:
        return pendulum.from_format(text, "DD/M/YYYY")
    except ValueError:
        try:
            return (pendulum.from_format(text, "DD MMM")
                    .set(year=pendulum.now().year))
        except ValueError:
            return None
 # Data type definitions of posts and comments
@attr.s
 class Post:
    content = attr.ib()
    comments = attr.ib(default=[])
-    date = attr.ib(factory=timestamp_now)
+    date = attr.ib(factory=pendulum.now)
    name = attr.ib(factory=lambda: uuid.uuid4().hex)
@attr.s
 class Comment:
    commenter = attr.ib()
    content = attr.ib()
-    date = attr.ib(factory=timestamp_now)
+    date = attr.ib(factory=pendulum.now)
    name = attr.ib(factory=lambda: uuid.uuid4().hex)
@attr.s
class Conversation:
    """
    A messenger conversation, with the messages and image links found in it
    """
    url = attr.ib()
    name = attr.ib()
    # Run through convert_date, which gives None for text it cannot parse
    date : datetime = attr.ib(converter=convert_date)
    # Factories give every instance its own list; a literal `[]` default
    # would be a single list shared by all instances
    messages = attr.ib(factory=list)
    image_links = attr.ib(factory=list)
@attr.s
class Message:
    """
    A single message within a conversation
    """
    name = attr.ib()
    content = attr.ib()
    # FB's dates carry 3 trailing digits that are not standard, so they are
    # dropped before the value is read as a timestamp
    date : datetime = attr.ib(
        converter=lambda raw: pendulum.from_timestamp(int(str(raw)[:-3]))
    )
@attr.s
 class Page:
    name = attr.ib()
-    date = attr.ib(factory=timestamp_now)
+    date = attr.ib(factory=pendulum.now)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,13 +1,26 @@
 attrs==19.1.0
 bitarray==0.9.3
 bleach==3.1.0
 certifi==2018.11.29
 chardet==3.0.4
 docutils==0.14
 idna==2.8
 lxml==4.4.0
 pendulum==2.0.5
 pkginfo==1.5.0.1
 pybloom-live==3.0.0
 Pygments==2.4.2
 python-dateutil==2.8.0
 pytzdata==2019.2
 readme-renderer==24.0
 requests==2.22.0
 requests-file==1.4.3
 requests-toolbelt==0.9.1
 selenium==3.141.0
 selenium-requests==1.3
 six==1.12.0
 tldextract==2.2.0
 tqdm==4.32.2
 twine==1.13.0
 urllib3==1.25.2
 webencodings==0.5.1
--- a/setup.py
+++ b/setup.py
@ -24,7 +24,10 @@ setuptools.setup(
        "selenium-requests",
        "requests",
        "pybloom-live",
-        "attrs"
+        "attrs",
        "cattrs",
        "lxml",
        "pendulum"
    ],
    classifiers= [
        "Programming Language :: Python :: 3",