Thursday, July 26, 2012
Sunday, July 15, 2012
Search Engine, Phase I: Collecting links in a seed page
1: #A program to gather links present in a webpage and to print them.
2: import urllib2
# Passed through to parse_contents(), which does not use it.
pos = 0

# Placeholder; web_contents() rebinds this to the downloaded page text.
contents = 0

# URL of the seed page, typed in by the user.
website_1 = raw_input("Enter the website")
def web_contents(website):
    """Download *website* and return the response body.

    The body is also stored in the module-level name ``contents`` so that
    code running after this call can read it without keeping the return
    value.

    Args:
        website: URL to open; passed unchanged to ``urllib2.urlopen``.

    Returns:
        Whatever ``read()`` on the opened response returns (the page text).

    Raises:
        Any error raised by ``urllib2.urlopen`` or ``read()`` propagates to
        the caller.
    """
    # `global` is required because the function rebinds the module-level
    # name; without it the assignment below would create a local variable.
    global contents
    web_page = urllib2.urlopen(website)
    try:
        contents = web_page.read()
    finally:
        # Always release the connection, even if read() fails.
        web_page.close()
    return contents
# Download the seed page. The call also stores the page text in the
# module-level name `contents`, which the parsing step reads afterwards.
web_contents(website_1)

# print contents  # uncomment to dump the raw page
#
# Notes on global variables:
#   * A name assigned at module level can be read from inside any function.
#   * To rebind such a name from inside a function, declare it with the
#     `global` keyword in that function; otherwise the assignment creates
#     a new local variable instead.
# http://stackoverflow.com/questions/423379/using-global-variables-in-a-function-other-than-the-one-that-created-them
def parse_contents(contents, pos):
    """Print and return the absolute links found in ``<a href=...>`` tags.

    The text is scanned for the literal marker ``<a href=``. For each
    occurrence the quoted attribute value (single or double quotes) is
    extracted, and it is kept only when it starts with ``http``. Relative
    links and unquoted values are skipped. Anchors whose ``href`` is not
    the first attribute do not match the marker and are not seen.

    Args:
        contents: Page text to scan.
        pos: Unused; accepted so existing calls keep working.

    Returns:
        A list of the collected URLs in document order (empty when the
        text contains no matching anchor). Each URL is also printed on its
        own line.
    """
    marker = "<a href="
    links = []
    cursor = 0
    while True:
        anchor = contents.find(marker, cursor)
        if anchor == -1:
            # No further anchors: stop before slicing with a -1 index.
            break
        value_start = anchor + len(marker)
        quote = contents[value_start:value_start + 1]
        if quote not in ('"', "'"):
            # Unquoted (or missing) value: there is no reliable end
            # delimiter, so skip this anchor and keep scanning.
            cursor = value_start
            continue
        value_start += 1
        value_end = contents.find(quote, value_start)
        if value_end == -1:
            # Unterminated attribute value: nothing usable remains.
            break
        url = contents[value_start:value_end]
        # Only the href value itself is inspected, so an "http" that occurs
        # later in the page can never be mistaken for this anchor's target.
        if url.startswith("http"):
            links.append(url)
        cursor = value_end + 1
    for each in links:
        print(each)
    return links
# Scan the downloaded page (held in the global `contents`) and print its links.
parse_contents(contents, pos)
47: #def extract_links(url_to_be_passed):
48: # list=[]
49: # list.append(url_to_be_passed)
Subscribe to:
Posts (Atom)