This is meant to be a comment for @EuanG, but given the space limits in the "comment section", I write it here.
I tried to wrap up what you wrote in a "def", but it is not really working... I guess something is missing... but I am not able to figure out what..
import re
def remove_references(text):
# Define regex patterns
numeric_pattern = r"\[\d+\]|\(\d+\)"
author_year_pattern = r"\b[A-Z][a-z]+ et al\., \d{4}|\(\w+, \d{4}\)"
doi_url_pattern = r"\bdoi:|http[s]?://\S+"
sequential_numeric_pattern = r"\[\d+(,\s*\d+)*\]|\(\d+(,\s*\d+)*\)"
journal_style_pattern = r"[A-Za-z\s]+, \d{1,4}\([\d\-]+\):\d+\-\d+"
common_references_pattern = r"^\[\d+\]|\(\d+\)|\b[A-Z][a-z]+ et al\., \d{4}|\(\w+, \d{4}\)|doi:|http[s]?://"
# Remove the references from the text using regex
cleaned_text = re.sub(common_references_pattern, "", text)
return cleaned_text