Skip to content

How many documents are in BERTopic's default dataset (20 Newsgroups, loaded from sklearn), and what is the average document length?


from sklearn.datasets import fetch_20newsgroups

# Fetch the full 20 Newsgroups corpus with headers, footers and quoted
# replies removed, then keep only the entries stored under 'data'.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
docs = newsgroups['data']

# Per-document length as reported by len(); presumably each document is a
# string, so this would be a character count -- confirm against sklearn docs.
len_docs = [len(text) for text in docs]
num_docs = len(docs)

print("Num of docs")
print(num_docs)
print("Average Length of doc")
# len_docs has exactly one entry per document, so dividing by num_docs
# yields the mean document length.
print(sum(len_docs) / num_docs)