committed by
GitHub
12 changed files with 317 additions and 29 deletions
@ -0,0 +1,13 @@ |
|||||
|
### How to contribute |
||||
|
|
||||
|
## Dependencies |
||||
|
If you are adding any new dependencies, please make sure that both `requirements.txt` and `setup.py` have been updated. If you are unsure about the difference between `requirements.txt` and the `install_requires` section of `setup.py`, please read [this article](https://caremad.io/posts/2013/07/setup-vs-requirement/). |
||||
|
|
||||
|
## Virtualenv |
||||
|
Always develop inside a virtualenv, and also test your changes with `pip install --user .`. This helps ensure that implicit dependencies aren't accidentally introduced, and makes it more likely that the average user can run the tool without issues. |
||||
|
|
||||
|
## Pull requests |
||||
|
Feel free to make a pull request! Make sure to give a brief overview of what you did, and why you think it is useful. If you are fixing a specific bug or resolving an issue, then make sure to reference it in your PR. |
||||
|
|
||||
|
## Coding style |
||||
|
Try to be as consistent with the existing codebase as possible. Keep code modular. Avoid repeating yourself where you can, but don't add needless complexity. Straightforward is often better than clever and optimized. |
@ -0,0 +1,17 @@ |
|||||
|
from .archive import archiver |
||||
|
from ..types import Comment |
||||
|
from .common import SELENIUM_EXCEPTIONS, logger, click_button |
||||
|
from selenium.webdriver.common.by import By |
||||
|
from selenium.webdriver.support import expected_conditions as EC |
||||
|
from selenium.webdriver.support.ui import WebDriverWait |
||||
|
|
||||
|
LOG = logger(__name__) |
||||
|
|
||||
|
def delete_comments(driver, profile_url):
    """
    Remove all comments on posts

    NOTE(review): the body visible here only loads the comments view of the
    activity log for `profile_url` and prepares a wait object; no removal
    steps follow it.
    """
    activity_log_url = (
        "{0}/allactivity?privacy_source=activity_log&category_key=commentscluster"
        .format(profile_url)
    )
    driver.get(activity_log_url)

    # 20-second explicit wait; not consumed anywhere in this function yet.
    wait = WebDriverWait(driver, 20)
@ -0,0 +1,179 @@ |
|||||
|
from .archive import archiver |
||||
|
from ..types import Conversation, Message |
||||
|
from .common import SELENIUM_EXCEPTIONS, logger, click_button, wait_xpath |
||||
|
from .config import settings |
||||
|
from selenium.webdriver.common.action_chains import ActionChains |
||||
|
from selenium.webdriver.support.ui import Select |
||||
|
from pendulum import now |
||||
|
from json import loads |
||||
|
|
||||
|
import lxml.html as lxh |
||||
|
|
||||
|
LOG = logger(__name__) |
||||
|
|
||||
|
def get_conversations(driver):
    """
    Collect every conversation listed in the thread list.

    Walks the current page and then each page reached through the
    "see_older_threads" link, returning a list of `Conversation` objects
    built from the link URL, the date text and the header text.
    """
    wait_xpath(driver, "//div[@id=\"threadlist_rows\"]")

    # Results are accumulated eagerly on purpose: this must not be a
    # generator, because the elements go stale once the driver navigates.
    found = []

    while True:
        for link in driver.find_elements_by_xpath("//a"):
            href = link.get_attribute("href")

            # Only anchors pointing at a message thread are of interest.
            if not href or "messages/read" not in href:
                continue

            date_text = link.find_element_by_xpath("../../..//abbr").text
            header = link.find_element_by_xpath("../../../div/div/header/h3")
            title = header.text.strip()

            assert title
            assert href

            found.append(Conversation(url=href, date=date_text, name=title))

        # Follow the pagination link; stop when it is absent or has no target.
        try:
            older_threads = driver.find_element_by_id("see_older_threads")
            next_url = older_threads.find_element_by_xpath("a").get_attribute("href")
        except SELENIUM_EXCEPTIONS:
            break

        if not next_url:
            break

        driver.get(next_url)

    return found
||||
|
|
||||
|
def parse_conversation(driver):
    """
    Yield a `Message` for each message node in the currently loaded page.

    The page source is parsed with lxml; the author and timestamp are read
    from the JSON held in each node's "data-store" attribute, and the
    content is the node's text.
    """
    page = lxh.fromstring(driver.page_source)

    for node in page.xpath("//div[@class='msg']/div"):
        meta = loads(node.get("data-store"))

        yield Message(
            name=meta.get("author"),
            content=node.text_content(),
            date=meta.get("timestamp"),
        )
||||
|
|
||||
|
def get_images(driver):
    """
    Gets all links to images in a messenger conversation

    Yields each distinct `src` value once, in document order.
    `<img>` tags without a `src` attribute (or with an empty one) are skipped.
    """
    # De-duplicate on the URL itself. Building a set of the <img> element
    # objects (as was done before) does not collapse repeated URLs and makes
    # the iteration order arbitrary.
    seen = set()

    for img in lxh.fromstring(driver.page_source).xpath("//img"):
        src = img.get("src")

        if not src or src in seen:
            continue

        seen.add(src)
        yield src
||||
|
|
||||
|
def get_convo(driver, convo):
    """
    Load a conversation, expand it, and collect its contents.

    Returns a `(messages, image_links)` tuple: the list of parsed messages
    and the list of image links found on the fully expanded page.
    """
    older_xpath = "//*[contains(text(), 'See Older Messages')]"

    driver.get(convo.url)
    wait_xpath(driver, older_xpath)

    # Keep clicking "See Older Messages" until the control can no longer be
    # found, i.e. the beginning of the conversation has been reached.
    while True:
        try:
            expand_button = driver.find_element_by_xpath(older_xpath)
        except SELENIUM_EXCEPTIONS:
            expand_button = None

        if not expand_button:
            break

        try:
            click_button(driver, expand_button)
        except SELENIUM_EXCEPTIONS:
            # A failed click is not fatal; look the button up again and retry.
            pass

    return (list(parse_conversation(driver)), list(set(get_images(driver))))
||||
|
|
||||
|
def delete_conversation(driver, convo):
    """
    Delete the conversation that is currently open in the driver.

    Picks the "Delete" entry of the action drop-down, waits for the
    "Delete conversation" heading, then clicks the confirmation button.
    `convo` is accepted but not read by this function.
    """
    chain = ActionChains(driver)

    dropdown = driver.find_element_by_xpath("//select/option[contains(text(), 'Delete')]/..")
    menu = Select(dropdown)

    # Index of the first option whose label is exactly "Delete", if any.
    delete_index = next(
        (idx for idx, opt in enumerate(menu.options) if opt.text.strip() == "Delete"),
        None,
    )
    if delete_index is not None:
        menu.select_by_index(delete_index)

    wait_xpath(driver, "//h2[contains(text(), 'Delete conversation')]")
    confirm_button = driver.find_element_by_xpath("//a[contains(text(), 'Delete')][@role='button']")
    chain.move_to_element(confirm_button).click().perform()
||||
|
|
||||
|
def extract_convo(driver, convo):
    """
    Fill in `convo` with its messages and image links.

    Calls `get_convo` and stores the two results on the passed-in
    conversation, which is then returned. Returns `None` when `get_convo`
    gives back a falsy result.
    """
    fetched = get_convo(driver, convo)

    if fetched:
        convo.messages, convo.image_links = fetched
        return convo

    return None
||||
|
|
||||
|
def traverse_conversations(driver, year=None):
    """
    Extract, optionally archive, and delete conversations.

    When `year` is given, only conversations whose parsed date falls in that
    year are processed (conversations without a date are skipped). When
    `year` is not given, every conversation is processed.
    """
    driver.get("https://mobile.facebook.com/messages/?pageNum=1&selectable&see_older_newer=1")

    convos = get_conversations(driver)

    with archiver("conversations") as archive_convo:
        for convo in convos:
            # With a year filter, skip anything undated or from another year.
            if year:
                if not convo.date or convo.date.year != int(year):
                    continue

            extract_convo(driver, convo)

            if settings["ARCHIVE"]:
                archive_convo.archive(convo)

            delete_conversation(driver, convo)
||||
|
|
@ -1,29 +1,55 @@ |
|||||
import attr |
import attr |
||||
import uuid |
import uuid |
||||
import datetime |
import pendulum |
||||
|
|
||||
def timestamp_now(): |
from datetime import datetime |
||||
|
|
||||
|
def convert_date(text):
    """
    Tries to parse a date into a DateTime instance

    Two formats are attempted in order: "DD/M/YYYY", then "DD MMM"
    (which carries no year, so the current year is applied).

    Returns `None` if `text` is empty/`None` or matches neither format.
    """
    # Nothing to parse; without this guard a `None` input would not raise
    # the ValueError handled below and the documented `None` result would
    # never be reached.
    if not text:
        return None

    try:
        return pendulum.from_format(text, "DD/M/YYYY")
    except ValueError:
        pass

    try:
        return (pendulum.from_format(text, "DD MMM")
                .set(year=pendulum.now().year))
    except ValueError:
        return None
||||
|
|
||||
# Data type definitions of posts and comments |
# Data type definitions of posts and comments |
||||
@attr.s
class Post:
    """
    A post: its content, its comments, a date and a name.

    `date` defaults to `pendulum.now()` and `name` to a random uuid4 hex
    string, both evaluated per instance.
    """
    content = attr.ib()
    # factory=list gives every instance its own list; default=[] would make
    # all instances share (and mutate) one single list object.
    comments = attr.ib(factory=list)
    date = attr.ib(factory=pendulum.now)
    name = attr.ib(factory=lambda: uuid.uuid4().hex)
||||
|
|
||||
@attr.s
class Comment:
    """
    A comment: who wrote it, its content, a date and a name.

    `date` defaults to `pendulum.now()` and `name` to a random uuid4 hex
    string, both evaluated per instance.
    """
    commenter = attr.ib()
    content = attr.ib()
    date = attr.ib(default=attr.Factory(pendulum.now))
    name = attr.ib(default=attr.Factory(lambda: uuid.uuid4().hex))
||||
|
|
||||
|
@attr.s
class Conversation:
    """
    A conversation: its URL, display name, date, messages and image links.

    `date` is passed through `convert_date`, so it holds whatever that
    converter returns for the given text.
    """
    url = attr.ib()
    name = attr.ib()
    date : datetime = attr.ib(converter=convert_date)
    # factory=list gives every instance its own list; default=[] would make
    # all conversations share (and mutate) one single list object.
    messages = attr.ib(factory=list)
    image_links = attr.ib(factory=list)
||||
|
|
||||
|
def _fb_timestamp_to_datetime(raw):
    # Remove the last 3 digits from FB's dates. They are not standard.
    # (presumably millisecond precision - TODO confirm)
    seconds = int(str(raw)[:-3])
    return pendulum.from_timestamp(seconds)


@attr.s
class Message:
    """
    A single message: author name, text content and date.

    `date` is converted from the raw timestamp value by dropping its last
    three characters and handing the rest to `pendulum.from_timestamp`.
    """
    name = attr.ib()
    content = attr.ib()
    date : datetime = attr.ib(converter=_fb_timestamp_to_datetime)
||||
|
|
||||
@attr.s
class Page:
    """
    A page, identified by its name, with a date.

    `date` defaults to `pendulum.now()`, evaluated per instance.
    """
    name = attr.ib()
    date = attr.ib(default=attr.Factory(pendulum.now))
||||
|
@ -1,13 +1,26 @@ |
|||||
attrs==19.1.0 |
attrs==19.1.0 |
||||
bitarray==0.9.3 |
bitarray==0.9.3 |
||||
|
bleach==3.1.0 |
||||
certifi==2018.11.29 |
certifi==2018.11.29 |
||||
chardet==3.0.4 |
chardet==3.0.4 |
||||
|
docutils==0.14 |
||||
idna==2.8 |
idna==2.8 |
||||
|
lxml==4.4.0 |
||||
|
pendulum==2.0.5 |
||||
|
pkginfo==1.5.0.1 |
||||
pybloom-live==3.0.0 |
pybloom-live==3.0.0 |
||||
|
Pygments==2.4.2 |
||||
|
python-dateutil==2.8.0 |
||||
|
pytzdata==2019.2 |
||||
|
readme-renderer==24.0 |
||||
requests==2.22.0 |
requests==2.22.0 |
||||
requests-file==1.4.3 |
requests-file==1.4.3 |
||||
|
requests-toolbelt==0.9.1 |
||||
selenium==3.141.0 |
selenium==3.141.0 |
||||
selenium-requests==1.3 |
selenium-requests==1.3 |
||||
six==1.12.0 |
six==1.12.0 |
||||
tldextract==2.2.0 |
tldextract==2.2.0 |
||||
|
tqdm==4.32.2 |
||||
|
twine==1.13.0 |
||||
urllib3==1.25.2 |
urllib3==1.25.2 |
||||
|
webencodings==0.5.1 |
||||
|
Loading…
Reference in new issue