Python error: Found array with 0 sample(s) (shape=(0, 262)) while a minimum of 1 is required


So I'm trying to build a natural language processor in Python. I used some code I found online and adapted my own stuff to it, but now it just doesn't want to work. It keeps giving me

    ValueError: Found array with 0 sample(s) (shape=(0, 262)) while a minimum of 1 is required.  

Here is my code. I'm sorry if it's messy; I just copied it straight from the web:

from collections import Counter
import pandas
from nltk.corpus import stopwords
import pandas as pd
import numpy
headlines = []
apps = pd.read_csv('DataUse.csv')
for e in apps['title_lower']:
    headlines.append(e)
testdata = pd.read_csv('testdata.csv')



# Find all the unique words in the headlines.
unique_words = list(set(" ".join(headlines).split(" ")))
def make_matrix(headlines, vocab):
    matrix = []
    for headline in headlines:
        # Count each word in the headline, and make a dictionary.
        counter = Counter(headline)
        # Turn the dictionary into a matrix row using the vocab.
        row = [counter.get(w, 0) for w in vocab]
        matrix.append(row)
    df = pandas.DataFrame(matrix)
    df.columns = unique_words
    return df

print(make_matrix(headlines, unique_words))

import re

# Lowercase, then replace any non-letter, space, or digit character in the headlines.
new_headlines = [re.sub(r'[^\w\s\d]','',h.lower()) for h in headlines]
# Replace sequences of whitespace with a space character.
new_headlines = [re.sub("\s+", " ", h) for h in new_headlines]

unique_words = list(set(" ".join(new_headlines).split(" ")))
# We've reduced the number of columns in the matrix a bit.
print(make_matrix(new_headlines, unique_words))


stopwords = set(stopwords.words('english'))

stopwords = [re.sub(r'[^\w\s\d]','',s.lower()) for s in stopwords]

unique_words = list(set(" ".join(new_headlines).split(" ")))
# Remove stopwords from the vocabulary.
unique_words = [w for w in unique_words if w not in stopwords]

# We're down to 34 columns, which is way better!
print(make_matrix(new_headlines, unique_words))
##
##
##
##
from sklearn.feature_extraction.text import CountVectorizer

# Construct a bag of words matrix.
# This will lowercase everything, and ignore all punctuation by default.
# It will also remove stop words.
vectorizer = CountVectorizer(lowercase=True, stop_words="english")

matrix = vectorizer.fit_transform(headlines)
# We created our bag of words matrix with far fewer commands.
print(matrix.todense())

# Let's apply the same method to all the headlines in all 100000 submissions.
# We'll also add the url of the submission to the end of the headline so we can take it into account.
full_matrix = vectorizer.fit_transform(apps['title_lower'])
print(full_matrix.shape)
##
##
##
##
##
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Convert the upvotes variable to binary so it works with a chi-squared test.
col = apps["total_shares"].copy(deep=True)
col_mean = col.mean()
col[col < col_mean] = 0
col[(col > 0) & (col > col_mean)] = 1
print col

# Find the 1000 most informative columns
selector = SelectKBest(chi2, k='all')
selector.fit(full_matrix, col)
top_words = selector.get_support().nonzero()

# Pick only the most informative columns in the data.
chi_matrix = full_matrix[:,top_words[0]]


##
##
##
##
##
##
import numpy as numpy
transform_functions = [
    lambda x: len(x),
    lambda x: x.count(" "),
    lambda x: x.count("."),
    lambda x: x.count("!"),
    lambda x: x.count("?"),
    lambda x: len(x) / (x.count(" ") + 1),
    lambda x: x.count(" ") / (x.count(".") + 1),
    lambda x: len(re.findall("\d", x)),
    lambda x: len(re.findall("[A-Z]", x)),
]

# Apply each function and put the results into a list.
columns = []
for func in transform_functions:
    columns.append(apps["title_lower"].apply(func))

# Convert the meta features to a numpy array.
meta = numpy.asarray(columns).T
##
##
##
##
##
##
##
features = numpy.hstack([chi_matrix.todense()])
from sklearn.linear_model import Ridge

import random

train_rows = 262
# Set a seed to get the same "random" shuffle every time.
random.seed(1)

# Shuffle the indices for the matrix.
indices = list(range(features.shape[0]))
random.shuffle(indices)


# Create train and test sets.


train = features[indices[:train_rows], :]

test = features[indices[train_rows:], :]
print test
train_upvotes = apps['total_shares'].iloc[indices[:train_rows]]
test_upvotes = apps['total_shares'].iloc[indices[train_rows:]]
train = numpy.nan_to_num(train)
print (test)
# Run the regression and generate predictions for the test set.
reg = Ridge(alpha=.1)
reg.fit(train, train_upvotes)
predictions = reg.predict(test)

##
##
##
##
##
### We're going to use mean absolute error as an error metric.
### Our error is about 13.6 upvotes, which means that, on average, 
### our prediction is 13.6 upvotes away from the actual number of upvotes.
##print(sum(abs(predictions - test_upvotes)) / len(predictions))
##
### As a baseline, we'll use the average number of upvotes
### across all submissions.
### The error here is 17.2 -- our estimate is better, but not hugely so.
### There either isn't a ton of predictive value encoded in the 
### data we have, or we aren't extracting it well.
##average_upvotes = sum(test_upvotes)/len(test_upvotes)
##print(sum(abs(average_upvotes - test_upvotes)) / len(predictions))
##

EDIT: The error is as follows:

Traceback (most recent call last):
  File "C:/Users/Tucker Siegel/Desktop/Machines/Test.py", line 156, in <module>
    predictions = reg.predict(test)
  File "C:\Python27\lib\site-packages\sklearn\linear_model\base.py", line 200, in predict
    return self._decision_function(X)
  File "C:\Python27\lib\site-packages\sklearn\linear_model\base.py", line 183, in _decision_function
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
  File "C:\Python27\lib\site-packages\sklearn\utils\validation.py", line 407, in check_array
    context))
ValueError: Found array with 0 sample(s) (shape=(0, 262)) while a minimum of 1 is required.
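For reference, the (0, 262) shape suggests the test slice is coming out empty: with train_rows = 262, indices[train_rows:] is an empty list whenever the data has 262 rows or fewer, so test has zero samples when it reaches reg.predict. A minimal, self-contained sketch (using synthetic data and hypothetical sizes, since DataUse.csv isn't available here) that reproduces the same error:

    import numpy
    from sklearn.linear_model import Ridge

    # Hypothetical sizes chosen to match the error shape (0, 262).
    n_rows, n_features = 262, 262
    features = numpy.random.rand(n_rows, n_features)
    target = numpy.random.rand(n_rows)

    train_rows = 262
    indices = list(range(features.shape[0]))

    train = features[indices[:train_rows], :]  # shape (262, 262)
    test = features[indices[train_rows:], :]   # indices[262:] is empty -> shape (0, 262)

    reg = Ridge(alpha=.1)
    reg.fit(train, target)
    # Raises: ValueError: Found array with 0 sample(s) (shape=(0, 262))
    # while a minimum of 1 is required.
    predictions = reg.predict(test)

Printing features.shape[0] before the split would show whether the real data actually has more than train_rows rows.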
