Python TypeError: '<' not supported between instances of 'NoneType' and 'str' - pipeline predict

I am trying to run sentiment analysis on strings using spaCy 2.3.5 and the latest version of Python. This is the code I am using:

# Use the punctuations of string module
import string

# Pandas library to create our model from reviews txt files
import pandas as pd

# Spacy library to lemmatize and tokenize our reviews
import spacy

# ML Packages
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Splitting Data Set
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Creating a Spacy Parser in english language
import en_core_web_sm
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.linear_model import SGDClassifier

# Load our dataset
# Renaming Column Headers Message and Target (1 == Is Positive, 0 == Is Negative)
df_yelp = spark.read.option("delimiter", "\t").csv('/FileStore/tables/yelp_labelled.txt').withColumnRenamed('_c0', 'Message').withColumnRenamed('_c1','Target').select("*").toPandas()
df_imdb = spark.read.option("delimiter", "\t").csv('/FileStore/tables/imdb_labelled.txt').withColumnRenamed('_c0', 'Message').withColumnRenamed('_c1','Target').select("*").toPandas()
df_amz = spark.read.option("delimiter", "\t").csv('/FileStore/tables/amazon_cells_labelled.txt').withColumnRenamed('_c0', 'Message').withColumnRenamed('_c1','Target').select("*").toPandas()

# Concatenate our Datasets
frames = [df_yelp, df_imdb, df_amz]


# Assign a Key to Make it Easier
keys = ['Yelp', 'IMDB', 'Amazon']

# Merge or Concat our Datasets
df = pd.concat(frames, keys=keys)

# We then create our CSV file with the columns (SOURCE NAME, NUMBER, MESSAGE, VALUE)
df.to_csv("sentimentdataset.csv")

nlp = en_core_web_sm.load()

stopwords = list(STOP_WORDS)

punctuations = string.punctuation

# We will use an English parser for the sentences
parser = English()

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    return mytokens

# Basic function to clean the text
def clean_text(text):
    return text.strip().lower()
  
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self


# CountVectorizer implements both tokenization and occurrence counting in a single class:
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))

# We use a linear classification algorithm
classifier = LinearSVC()

# Features Column Name. This is the column with the message
X = df['Message']

# This is the column name with the value 1 or 0, Positive or Negative
ylabels = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

# Create the pipeline to clean, tokenize, vectorize, and classify
pipe = Pipeline([("cleaner", predictors()), ('vectorizer', vectorizer), ('classifier', classifier)])

# Fit our data because we have to unify the trained data column together
pipe.fit(X_train, y_train)

# NOW WE CAN START TO PREDICT THE SENTIMENT OF THE PEOPLE WHO WRITE REVIEWS

# Another random review
example = ["Test string"]

print(pipe.predict(example))

While training the classifier on the pipe.fit(X_train, y_train) line, I receive the following error: "TypeError: '<' not supported between instances of 'NoneType' and 'str'" (the full traceback appears at the end of this post).

My best guess is that the text preprocessing drops some rows (sometimes a row consists entirely of words that get removed), so your X_train ends up with fewer rows than y_train (the dropped rows show up as None), which translates into the NoneType vs. str error you are receiving.
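A quick way to test that guess (a minimal sketch; isna and unique are standard pandas calls, applied here to the Target column of the df built above):

# Count missing labels; a None/NaN entry is exactly what makes the sort
# inside np.unique compare NoneType against str and raise the TypeError.
print(df['Target'].isna().sum())
print(df['Target'].unique())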

Does np.unique(y_train) fail with the same message? What are the contents of y_train?

@BenReiniger Yes, np.unique(y_train) returns the same error. The contents of y_train are as follows:

Yelp   642    1
       700    0
       226    1
IMDB   697    0
       10     1
             ..
       638    1
       95     1
       130    0
       294    1
Yelp   860    1
Name: Target, Length: 2400, dtype: object

Perhaps some entries really are missing (np.nan or None), and that is what causes the error. But those entries all look strange; are you sure they make sense? What do the numbers represent? Your target does not look right: if you are doing classification, these should be classes, not "Yelp 642 1 700 0 226 1 ...". Please update your question with the contents of your target column, or link the file.

Thank you for your answer. Could you provide an example of how to do that?

What I would do is look at your spacy_tokenizer function: that is where certain words get removed. I would apply that function to your text corpus (X_train) and check whether the final result has the same shape as the original X_train. I also see that y_train contains both numbers and strings. Can you specify the structure of y_train? Is it a vector or a matrix?
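A minimal sketch of that check, assuming the spacy_tokenizer function and the X_train/y_train split defined in the question:

# Apply the tokenizer to every training sentence and flag the ones that
# come back empty (i.e. every word was a stop word or punctuation).
empty_rows = [i for i, text in enumerate(X_train) if len(spacy_tokenizer(text)) == 0]
print(len(X_train), len(y_train), len(empty_rows))

# List the types present in y_train; a mix of str and NoneType would
# explain the failing '<' comparison inside np.unique.
print({type(v) for v in y_train})

For reference, the full traceback raised by pipe.fit(X_train, y_train):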
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<command-1852731017614759> in <module>
     93 
     94 # Fit our data because we have to unify the trained data column together
---> 95 pipe.fit(X_train, y_train)
     96 
     97 # NOW WE CAN START TO PREDICT THE SENTIMENT OF THE PEOPLE WHO WRITE REVIEWS

/databricks/python/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    333             if self._final_estimator != 'passthrough':
    334                 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335                 self._final_estimator.fit(Xt, y, **fit_params_last_step)
    336 
    337         return self

/databricks/python/lib/python3.8/site-packages/sklearn/svm/_classes.py in fit(self, X, y, sample_weight)
    228                                    dtype=np.float64, order="C",
    229                                    accept_large_sparse=False)
--> 230         check_classification_targets(y)
    231         self.classes_ = np.unique(y)
    232 

/databricks/python/lib/python3.8/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
    167     y : array-like
    168     """
--> 169     y_type = type_of_target(y)
    170     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    171                       'multilabel-indicator', 'multilabel-sequences']:

/databricks/python/lib/python3.8/site-packages/sklearn/utils/multiclass.py in type_of_target(y)
    288         return 'continuous' + suffix
    289 
--> 290     if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
    291         return 'multiclass' + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
    292     else:

<__array_function__ internals> in unique(*args, **kwargs)

/databricks/python/lib/python3.8/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
    259     ar = np.asanyarray(ar)
    260     if axis is None:
--> 261         ret = _unique1d(ar, return_index, return_inverse, return_counts)
    262         return _unpack_tuple(ret)
    263 

/databricks/python/lib/python3.8/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
    320         aux = ar[perm]
    321     else:
--> 322         ar.sort()
    323         aux = ar
    324     mask = np.empty(aux.shape, dtype=np.bool_)
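The sort at the bottom of the traceback is where a None label gets compared against a string label. A hedged fix sketch, assuming the Spark-loaded DataFrame from the question (Spark's csv reader returns string columns by default, and fields it cannot parse come back as null/None):

import pandas as pd

# Coerce the labels to numbers, drop the rows that fail to parse, and
# cast to int so the target contains only 0/1 class labels before the
# train/test split.
df['Target'] = pd.to_numeric(df['Target'], errors='coerce')
df = df.dropna(subset=['Target'])
df['Target'] = df['Target'].astype(int)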