import urllib2import BeautifulSoupimport stringbadwords = set([ 'cup','cups', 'clove','cloves', 'tsp','teaspoon','teaspoons', 'tbsp','tablespoon','tablespoons', 'minced'])def cleanIngred(s): # remove leading and trailing whitespace s = s.strip() # remove numbers and punctuation in the string s = s.strip(string.digits + string.punctuation) # remove unwanted words return ' '.join(word for word in s.split() if not word in badwords)def main(): url = "http://allrecipes.com/Recipe/Slow-Cooker-Pork-Chops-II/Detail.aspx" data = urllib2.urlopen(url).read() bs = BeautifulSoup.BeautifulSoup(data) ingreds = bs.find('div', {'class': 'ingredients'}) ingreds = [cleanIngred(s.getText()) for s in ingreds.findAll('li')] fname = 'PorkRecipe.txt' with open(fname, 'w') as outf: outf.write('n'.join(ingreds))if __name__=="__main__": main()
结果是
olive oilchicken brothgarlic,paprikagarlic powderpoultry seasoningdried oreganodried basilthick cut boneless pork chopssalt and pepper to taste
?我不知道为什么它在其中留下了逗号-s.strip(string.punctuation)应该已经解决了。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)