Automate Scrubbing your Facebook Presence
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

177 lines
4.8 KiB

from .archive import archiver
from ..types import Conversation, Message
from .common import SELENIUM_EXCEPTIONS, logger, click_button
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from pendulum import now
from json import loads
import lxml.html as lxh
LOG = logger(__name__)
def get_conversations(driver):
"""
Get a list of conversations
"""
actions = ActionChains(driver)
wait = WebDriverWait(driver, 20)
try:
wait.until(
EC.presence_of_element_located((By.XPATH, "//div[@id=\"threadlist_rows\"]"))
)
except SELENIUM_EXCEPTIONS:
LOG.exception("No conversations")
return
# This function *cannot* be a generator
# Otherwise elements will become stale
conversations = []
while True:
for convo in driver.find_elements_by_xpath("//a"):
url = convo.get_attribute("href")
date = None
if url and "messages/read" in url:
date = convo.find_element_by_xpath("../../..//abbr").text
conversation_name = convo.find_element_by_xpath("../../../div/div/header/h3").text.strip()
assert(conversation_name)
assert(url)
conversations.append(
Conversation(
url=url,
date=date,
name=conversation_name
)
)
try:
next_url = (driver.find_element_by_id("see_older_threads").
find_element_by_xpath("a").
get_attribute("href"))
except SELENIUM_EXCEPTIONS:
break
if not next_url:
break
driver.get(next_url)
return conversations
def parse_conversation(driver):
"""
Extracts all messages in a conversation
"""
for msg in lxh.fromstring(driver.page_source).xpath("//div[@class='msg']/div"):
data_store = loads(msg.get("data-store"))
msg_text = msg.text_content()
yield Message(
name=data_store.get("author"),
content=msg_text,
date=data_store.get("timestamp")
)
def get_images(driver):
"""
Gets all links to images in a messenger conversation
Removes duplicates
"""
for img in set(lxh.fromstring(driver.page_source).xpath("//img")):
yield img.get("src")
def get_convo(driver, convo):
"""
Get all of the messages/images for a given conversation
Returns a list of messages and a list of image links
"""
driver.get(convo.url)
wait = WebDriverWait(driver, 20)
try:
wait.until(
EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'See Older Messages')]"))
)
except SELENIUM_EXCEPTIONS:
LOG.exception("Could not load more messages")
return
# Expand conversation until we've reached the beginning
while True:
try:
see_older = driver.find_element_by_xpath("//*[contains(text(), 'See Older Messages')]")
except SELENIUM_EXCEPTIONS:
break
if not see_older:
break
try:
click_button(driver, see_older)
except SELENIUM_EXCEPTIONS:
continue
messages = list(parse_conversation(driver))
image_links = list(set(get_images(driver)))
return (messages, image_links)
def delete_conversation(driver, convo):
"""
Deletes a conversation
"""
return
def extract_convo(driver, convo):
"""
Extract messages and image links from a conversation
Return a new Conversation instance
"""
result = get_convo(driver, convo)
if not result:
return None
messages, image_links = result
convo.messages = messages
convo.image_links = image_links
return convo
def traverse_conversations(driver, year=None):
"""
Remove all conversations within a specified range
"""
driver.get("https://mobile.facebook.com/messages/?pageNum=1&selectable&see_older_newer=1")
convos = get_conversations(driver)
with archiver("conversations") as archive_convo:
for convo in convos:
# If the year is set and there is a date
# Then we want to only look at convos from this year
if year and convo.date:
if convo.date.year == int(year):
extract_convo(driver, convo)
archive_convo.archive(convo)
# Otherwise we're looking at all convos
elif not year:
extract_convo(driver, convo)
archive_convo.archive(convo)